-
Notifications
You must be signed in to change notification settings - Fork 480
Description
Description
The Python code below causes DBBCSD to compute inaccurate left singular vectors.
Given the input matrices X11, X21 of the DBBCSD caller DORCSD2BY1, DORCSD2BY1 computes matrices such that
X11 = U1 D1 V^*
X21 = U2 D2 V^*
where
X11^* X11 + X21^* X21 = I
U1^* U1 = I
U2^* U2 = I
D1^* D1 + D2^* D2 = I`.
V^* V = I
In the example code below U2 is inaccurate, whereas all other matrices are computed with sufficient accuracy:
‖X11 - U1 D1 V^*‖ ≤ 4 ε ‖X11‖
‖X21 - U2 D2 V^*‖ ≈ 50 ε ‖X21‖
The issue was pinned on DBBCSD after stepping through the code with GDB and reducing the threshold value below which off-diagonal entries are considered zero by the Golub-Reinsch-SVD-like algorithm; this fixes the problem.
The issue is present in at least two tested LAPACK releases:
- the current master branch code,
- the LAPACK code in OpenBLAS 0.3.21 shipping with Debian 12.
The demo code below requires
- a Python3 interpreter,
- NumPy
- SciPy
- a LAPACK installation.
Example (the pre-processing step reduces the number of rows of X11 and makes iterative computation of the singular values superfluous; this allows one to diagnose DBBCSD as the source of the observed inaccuracies):
$ python3 dbbcsd-nonconvergence-demo.py
Norm-wise difference A, re-assembled A: 4 eps
Norm-wise difference B, re-assembled B: 50 eps ✘
Norm-wise difference A, re-assembled A with pre-processing: 1 eps
Norm-wise difference B, re-assembled B with pre-processing: 1 eps#!/usr/bin/python3
# This file demonstrates the inaccurate computation of the buttom left-hand
# matrix U2 in a CS decomposition by DORCSD2BY1, where
# |U1 | |D1| V1^* = |X11|
# | U2| |D2| |X21|
#
# The problem was found while working on the generalized singular value
# decomposition which can be computed by a QR followed by a 2x1 CS
# decomposition. For this reason, the demo code in the `main` function is
# structured as follows:
# 1. Define matrices A, B for which you want to compute the GSVD
# 2. Compute the QR, followed by the 2x1 CS decomposition yielding
# A = U1 D1 X
# B = U2 D2 X
# The problem is that `norm(B - U2 D2 X)` is way too large (~50 ε norm(B)).
# 3. Pre-process A with a QR decomposition with column pivoting to find the
# full rank part (let us call this part `R_A`).
# 4. Compute the GSVD for (R_A, B), get matrices with suffix `_h`.
# 5. It will hold the following matrices are small in norm implying a problem
# with the factor U2 computed by the 2x1 CS decomposition:
# A - U1 D1 X
# A - U1_h D1_h X_h
# B - U2_h D2_h X_h
# D1 - D1_h
# D2 - D2_h
# X - diag(1, -1) X_h (meaning X, X_h are near-identical up to the signs)
import array
import ctypes
import ctypes.util
from ctypes import byref, c_char, c_double, c_int32, c_size_t, c_void_p, POINTER
import sys
from typing import Optional
import numpy as np
import scipy.linalg as la
def get_lapack_library(hint: Optional[str]) -> ctypes.CDLL:
if hint is None:
lapack_library = ctypes.util.find_library("lapack")
else:
lapack_library = hint
lapack = ctypes.CDLL(lapack_library, use_errno=True)
# an alias for the _F_ortran integer type
f_int = ctypes.c_int32
lapack.dorcsd2by1_.restype = None
lapack.dorcsd2by1_.argtypes = [
POINTER(c_char),
POINTER(c_char),
POINTER(c_char),
# M
POINTER(f_int),
# N
POINTER(f_int),
# P
POINTER(f_int),
# X11
POINTER(c_double),
POINTER(f_int),
# X21
POINTER(c_double),
POINTER(f_int),
# theta
POINTER(c_double),
# U1
POINTER(c_double),
POINTER(f_int),
# U2
POINTER(c_double),
POINTER(f_int),
# V^T
POINTER(c_double),
POINTER(f_int),
# work
POINTER(c_double),
POINTER(f_int),
POINTER(f_int),
POINTER(f_int),
POINTER(c_size_t),
POINTER(c_size_t),
POINTER(c_size_t),
]
return lapack
def invert_permutation(π):
inv = π.copy()
for i in range(len(inv)):
inv[π[i]] = i
return inv
def main():
if len(sys.argv) not in [1, 2, 3]:
return "usage: python3 %s [path to LAPACK library]" % (sys.argv[0], )
if len(sys.argv) >= 2:
lapack_library = sys.argv[1]
else:
lapack_library = None
lapack = get_lapack_library(lapack_library)
yes = byref(c_char(b'y'))
f_int = ctypes.c_int32
intref = lambda n: byref(f_int(n))
f64ref = lambda array: (c_double * len(array)).from_buffer(array)
nan = float("NaN")
# input matrices
a = np.array([[
+6.74929253732324819e-02,
+1.22859375733411061e-01,
+5.07325941117727705e-02,
],
[
-8.12706370462513289e-02,
-1.47939353313599864e-01,
-6.10889247972628419e-02,
],
[
+4.60489625514949097e-01,
+8.38242935976169390e-01,
+3.46137511965030342e-01,
]],
order="F")
# Make the norm of B similar to the norm of A because the underlying
# problem is unrelated to matrix scaling.
w = np.ldexp(1, -20)
b = w * np.array([[
-5.10454736289108230e+05,
+4.58049065850535840e+02,
+6.52789076380331302e+05,
],
[
-6.85877465731282020e+05,
+6.15462077459372153e+02,
+8.77126384642258170e+05,
],
[
-2.12078129698660923e+05,
+1.90305197065533122e+02,
+2.71213638672229019e+05,
]],
order="F")
assert la.norm(a) <= 2 * la.norm(b)
assert la.norm(b) <= 2 * la.norm(a)
# compute QR+CS decomposition
g = np.vstack([a, b])
q, r, π = la.qr(g, pivoting=True, mode="economic")
eps = np.finfo(g.dtype).eps
rank = np.sum(abs(np.diag(r)) >= eps * la.norm(g))
assert rank == 2
# Compute 2-by-1 CS decomposition (there is no SciPy low-level routine yet)
m = len(q)
n = rank
p = len(a)
# Convert Q to C-contiguous NumPy array (for Python package array) in
# Fortran order (for LAPACK)
x11 = np.ascontiguousarray(q[:p].T.copy())
x21 = np.ascontiguousarray(q[p:].T.copy())
theta = array.array("d", [nan] * n)
u1 = array.array("d", [nan] * (p * p))
u2 = array.array("d", [nan] * ((m - p) * (m - p)))
v1t = array.array("d", [nan] * (n * n))
work = array.array("d", [nan] * 256)
iwork = array.array("i", [0] * (m + n))
info = f_int(0)
lapack.dorcsd2by1_(
yes,
yes,
yes,
intref(m),
intref(p),
intref(n),
f64ref(x11),
intref(p),
f64ref(x21),
intref(m - p),
f64ref(theta),
f64ref(u1),
intref(p),
f64ref(u2),
intref(m - p),
f64ref(v1t),
intref(n),
f64ref(work),
intref(len(work)),
(ctypes.c_int * len(iwork)).from_buffer(iwork),
byref(info),
byref(c_size_t(1)),
byref(c_size_t(1)),
byref(c_size_t(1)),
)
u1 = np.array(u1).reshape(p, p).T
u2 = np.array(u2).reshape(m - p, m - p).T
d1 = np.full((p, rank), 0, dtype=g.dtype)
d1[0, 0] = np.cos(theta[0])
d1[1, 1] = np.cos(theta[1])
d2 = np.full((m - p, rank), 0, dtype=g.dtype)
d2[1, 0] = np.sin(theta[0])
d2[2, 1] = np.sin(theta[1])
v1t = np.array(v1t).reshape(n, rank).T
x = v1t @ r[:rank][:, invert_permutation(π)]
a_reassembled = u1 @ d1 @ x
b_reassembled = u2 @ d2 @ x
delta_a = la.norm(a - a_reassembled) / (eps * la.norm(a))
delta_b = la.norm(b - b_reassembled) / (eps * la.norm(b))
print("Norm-wise difference A, re-assembled A: %.0f eps" % (delta_a, ))
print("Norm-wise difference B, re-assembled B: %.0f eps ✘" % (delta_b, ))
assert delta_a <= 8
assert delta_b > 32
# Retry the CS decomposition with a pre-processed matrix A
# Pre-process A:
qa, ra, πa = la.qr(a, pivoting=True, mode="economic")
rank_a = np.sum(abs(np.diag(ra)) >= eps * la.norm(a))
assert rank_a == 1
# compute QR+CS decomposition
h = np.vstack([ra[:rank_a, invert_permutation(πa)], b])
q_h, r_h, π_h = la.qr(h, pivoting=True, mode="economic")
rank_h = np.sum(abs(np.diag(r_h)) >= eps * la.norm(h))
assert rank_h == rank
# Compute 2-by-1 CS decomposition again
#
# DBBCSD does not have to iteratively compute singular values because X11
# was reduced to a single row. Thus, its input will be near-diagonal
# instead of bidiagonal matrices.
m = rank_a + len(b)
p = rank_a
n = rank_h
x11_h = np.ascontiguousarray(q_h[:p].T.copy())
x21_h = np.ascontiguousarray(q_h[p:].T.copy())
theta_h = array.array("d", [nan] * n)
u1_h = array.array("d", [nan] * (p * p))
u2_h = array.array("d", [nan] * ((m - p) * (m - p)))
v1t_h = array.array("d", [nan] * (n * n))
work = array.array("d", [nan] * 256)
iwork = array.array("i", [0] * (m + n))
info = f_int(0)
lapack.dorcsd2by1_(
yes,
yes,
yes,
intref(m),
intref(p),
intref(n),
f64ref(x11_h),
intref(p),
f64ref(x21_h),
intref(m - p),
f64ref(theta_h),
f64ref(u1_h),
intref(p),
f64ref(u2_h),
intref(m - p),
f64ref(v1t_h),
intref(n),
f64ref(work),
intref(len(work)),
(ctypes.c_int * len(iwork)).from_buffer(iwork),
byref(info),
byref(c_size_t(1)),
byref(c_size_t(1)),
byref(c_size_t(1)),
)
assert np.array(u1_h) == 1
u1_h = qa
u2_h = np.array(u2_h).reshape(m - p, m - p).T
d1_h = np.full((len(a), rank_h), 0, dtype=h.dtype)
d1_h[0, 0] = np.cos(theta_h[0])
d1_h[1, 1] = 0
d2_h = np.full((m - p, rank_h), 0, dtype=h.dtype)
d2_h[1, 0] = np.sin(theta_h[0])
d2_h[2, 1] = 1
v1t_h = np.array(v1t_h).reshape(n, rank_h).T
x_h = v1t_h @ r_h[:rank_h][:, invert_permutation(π_h)]
a_h_reassembled = u1_h @ d1_h @ x_h
b_h_reassembled = u2_h @ d2_h @ x_h
delta_ah = la.norm(a - a_h_reassembled) / (eps * la.norm(a))
delta_bh = la.norm(b - b_h_reassembled) / (eps * la.norm(b))
print(
"Norm-wise difference A, re-assembled A with pre-processing: %.0f eps"
% (delta_ah, ))
print(
"Norm-wise difference B, re-assembled B with pre-processing: %.0f eps"
% (delta_bh, ))
assert delta_ah <= 4
assert delta_bh <= 4
if __name__ == "__main__":
sys.exit(main())Checklist
- I've included a minimal example to reproduce the issue
- I'd be willing to make a PR to solve this issue