Skip to content

Missed optimization: Unreachable branch not eliminated due to circular dependency #160001

@rluvaton

Description

@rluvaton

Consider the following rust example (simplified version of pushing to vector with sufficient capacity):

// using const for simplicity, it happen as well if getting it as argument
const COUNT: usize = 10000;

#[no_mangle]
pub fn run() -> usize {
    let mut inter = 0;
    let mut my_cap = COUNT;

    for _ in 0..COUNT {
        let len = inter;

        if len == my_cap {
            something(len);
        }

        inter += 1;
    }

    return inter;
}

#[inline(never)]
fn something(len: usize) {
    std::hint::black_box(len as i128);
}

pub fn main() {
    std::hint::black_box(run());
}

Using rustc nightly with -C opt-level=3 -C target-feature=+avx2 -C codegen-units=1 -Z mir-opt-level=3 in GodBolt
you can see that

if len == my_cap {
    something(len);
}

Is optimized away.

example::main::h86544b4302037868:
        mov     qword ptr [rsp - 8], 10000
        lea     rax, [rsp - 8]
        ret

run:
        mov     eax, 10000
        ret

as the condition can never be reached.

however when changing the body inside the unreachable condition to be:

if len == my_cap {
    something(len);
    my_cap += 1;
}
code after change

const COUNT: usize = 10000;

#[no_mangle]
pub fn run() -> usize {
    let mut inter = 0;
    let mut my_cap = COUNT;

    for _ in 0..COUNT {
        let len = inter;

        if len == my_cap {
            something(len);
            my_cap += 1;
        }

        inter += 1;
    }

    return inter;
}

#[inline(never)]
fn something(len: usize) {
    std::hint::black_box(len as i128);
}

pub fn main() {
    std::hint::black_box(run());
}

You can see that the if is not optimized away

Generated assembly

example::main::h86544b4302037868:
        push    rbx
        sub     rsp, 16
        mov     eax, 10000
        xor     edi, edi
        jmp     .LBB0_1
.LBB0_3:
        mov     rdi, rbx
        cmp     rbx, 10000
        je      .LBB0_4
.LBB0_1:
        lea     rbx, [rdi + 1]
        cmp     rdi, rax
        jne     .LBB0_3
        call    example::something::h16dfb585d2622cee
        mov     rax, rbx
        jmp     .LBB0_3
.LBB0_4:
        mov     qword ptr [rsp + 8], 10000
        lea     rax, [rsp + 8]
        add     rsp, 16
        pop     rbx
        ret

example::something::h16dfb585d2622cee:
        mov     qword ptr [rsp - 24], rdi
        mov     qword ptr [rsp - 16], 0
        lea     rax, [rsp - 24]
        ret

run:
        push    rbx
        mov     eax, 10000
        xor     edi, edi
        jmp     .LBB2_1
.LBB2_3:
        mov     rdi, rbx
        cmp     rbx, 10000
        je      .LBB2_4
.LBB2_1:
        lea     rbx, [rdi + 1]
        cmp     rdi, rax
        jne     .LBB2_3
        call    example::something::h16dfb585d2622cee
        mov     rax, rbx
        jmp     .LBB2_3
.LBB2_4:
        mov     eax, 10000
        pop     rbx
        ret


I tried testing in C++ with x86-64 clang 21.1.0 and -O3 and same result.

(I don't really know cpp, so I tried my best to replicate the rust code)

the code:

#include <cstddef>
#include <cstdio>

const size_t COUNT = 10000;

__attribute__((noinline))
void something(size_t len) {
    printf("%d", len);
    (void)len;
}

extern "C" size_t run() {
    size_t inter = 0;
    size_t my_cap = COUNT;

    for (size_t i = 0; i < my_cap; ++i) {
        size_t len = inter;

        if (len == my_cap) {
            something(len);
            my_cap += 1;
        }

        inter += 1;
    }
    

    return inter;
}

int main() {
    volatile size_t result = run();
    (void)result;
    return 0;
}
Produce the following assembly:

something(unsigned long):
        mov     rsi, rdi
        lea     rdi, [rip + .L.str]
        xor     eax, eax
        jmp     printf@PLT

run:
        mov     rax, -25
.LBB1_1:
        add     rax, 25
        cmp     rax, 9975
        jb      .LBB1_1
        add     rax, 25
        ret

main:
        mov     qword ptr [rsp - 8], 10000
        mov     rax, qword ptr [rsp - 8]
        xor     eax, eax
        ret

.L.str:
        .asciz  "%d"

.Ldebug_list_header_start0:
        .short  5
        .byte   8
        .byte   0
        .long   3
.Ldebug_list_header_end0:

Using x86-64 gcc 15.2 with -O3 however do optimize it:

.LC0:
        .string "%d"
something(unsigned long):
        mov     rsi, rdi
        xor     eax, eax
        mov     edi, OFFSET FLAT:.LC0
        jmp     printf
run:
        mov     eax, 10000
        ret
main:
        mov     QWORD PTR [rsp-8], 10000
        mov     rax, QWORD PTR [rsp-8]
        xor     eax, eax
        ret

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions