Poor performance of C++ virtual inheritance

The following example illustrates the poor performance which can result when using C++ virtual inheritance.

Interface inheritance

This is with Microsoft Visual Studio 16.11.7 (Nov 2021) release x64 build with /O2 /GL switches.


struct Iw
{
    virtual void inc1() = 0;
    virtual void inc5() = 0;
};

struct Ix : public virtual Iw
{
    virtual void inc2() = 0;
};

struct Iy : public virtual Iw
{
    virtual void inc3() = 0;
};

struct Iz : public virtual Ix, public virtual Iy
{
    virtual void inc4() = 0;
};

// size = 88 bytes!
struct C : public Iz
{
    C() : x(0) {}
    virtual void inc1() { x += 1; }
    virtual void inc2() { inc1(); inc1(); }
    virtual void inc3() { inc2(); inc1(); }
    virtual void inc4() { inc3(); inc1(); }
    virtual void inc5() { inc4(); inc1(); }
    int x;
};

void run()
{
    C c;
    Iw* p = &c;

    // Rate is 52 MHz on Ryzen9 3900X Windows-x64
    for (int i=0 ; i < 1000000 ; ++i)
    {
        p->inc5();
    }
}

Machine code executed for the loop

The inefficiency is apparent from the machine code which is executed for the loop.


00007FF68ECF0A36  mov         rax,qword ptr [r15]  
00007FF68ECF0A39  mov         rcx,r15  
00007FF68ECF0A3C  call        qword ptr [rax+8]  
00007FF68ECA1253  jmp         InterfaceTimingTests::Cz2::inc5 (07FF68ECF1FD4h)  
00007FF68ECF1FD4  movsxd      rax,dword ptr [rcx-4]  
00007FF68ECF1FD8  sub         rcx,rax  
00007FF68ECF1FDB  jmp         InterfaceTimingTests::Cz2::inc5 (07FF68ECA2B8Ah)  
00007FF68ECA2B8A  jmp         InterfaceTimingTests::Cz2::inc5 (07FF68ECF1FF0h)  
00007FF68ECF1FF0  push        rbx  
00007FF68ECF1FF2  sub         rsp,20h  
00007FF68ECF1FF6  mov         rax,qword ptr [rcx-20h]  
00007FF68ECF1FFA  mov         rbx,rcx  
00007FF68ECF1FFD  add         rcx,0FFFFFFFFFFFFFFE0h  
00007FF68ECF2001  call        qword ptr [rax]
00007FF68ECA180C  jmp         InterfaceTimingTests::Cz2::inc4 (07FF68ECF1F70h)  
00007FF68ECF1F70  push        rbx  
00007FF68ECF1F72  sub         rsp,20h  
00007FF68ECF1F76  mov         rax,qword ptr [rcx+8]  
00007FF68ECF1F7A  mov         rbx,rcx  
00007FF68ECF1F7D  add         rcx,8  
00007FF68ECF1F81  movsxd      rdx,dword ptr [rax+0Ch]  
00007FF68ECF1F85  add         rcx,rdx  
00007FF68ECF1F88  mov         rax,qword ptr [rcx]  
00007FF68ECF1F8B  call        qword ptr [rax]  
00007FF68ECA32D3  jmp         InterfaceTimingTests::Cz2::inc3 (07FF68ECF1EF4h)  
00007FF68ECF1EF4  movsxd      rax,dword ptr [rcx-4]  
00007FF68ECF1EF8  sub         rcx,rax  
00007FF68ECF1EFB  jmp         InterfaceTimingTests::Cz2::inc3 (07FF68ECA4868h)  
00007FF68ECA4868  jmp         InterfaceTimingTests::Cz2::inc3 (07FF68ECF1F10h)  
00007FF68ECF1F10  push        rbx  
00007FF68ECF1F12  sub         rsp,20h  
00007FF68ECF1F16  mov         rax,qword ptr [rcx-40h]  
00007FF68ECF1F1A  mov         rbx,rcx  
00007FF68ECF1F1D  add         rcx,0FFFFFFFFFFFFFFC0h  
00007FF68ECF1F21  movsxd      rdx,dword ptr [rax+8]  
00007FF68ECF1F25  add         rcx,rdx  
00007FF68ECF1F28  mov         rax,qword ptr [rcx]  
00007FF68ECF1F2B  call        qword ptr [rax]
00007FF68ECA2B3F  jmp         InterfaceTimingTests::Cz2::inc2 (07FF68ECF1E84h)  
00007FF68ECF1E84  movsxd      rax,dword ptr [rcx-4]  
00007FF68ECF1E88  sub         rcx,rax  
00007FF68ECF1E8B  jmp         InterfaceTimingTests::Cz2::inc2 (07FF68ECA31D9h)  
00007FF68ECA31D9  jmp         InterfaceTimingTests::Cz2::inc2 (07FF68ECF1EA0h)  
00007FF68ECF1EA0  push        rbx  
00007FF68ECF1EA2  sub         rsp,20h  
00007FF68ECF1EA6  mov         rax,qword ptr [rcx-28h]  
00007FF68ECF1EAA  mov         rbx,rcx  
00007FF68ECF1EAD  add         rcx,0FFFFFFFFFFFFFFD8h  
00007FF68ECF1EB1  movsxd      rdx,dword ptr [rax+4]  
00007FF68ECF1EB5  add         rcx,rdx  
00007FF68ECF1EB8  mov         rax,qword ptr [rcx]  
00007FF68ECF1EBB  call        qword ptr [rax]  
00007FF68ECA2914  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECF1E54h)  
00007FF68ECF1E54  movsxd      rax,dword ptr [rcx-4]  
00007FF68ECF1E58  sub         rcx,rax  
00007FF68ECF1E5B  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECA1E6Fh)  
00007FF68ECA1E6F  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECF1E70h)  
00007FF68ECF1E70  inc         dword ptr [rcx-10h]  
00007FF68ECF1E73  ret  
00007FF68ECF1EBD  mov         rax,qword ptr [rbx-28h]  
00007FF68ECF1EC1  lea         rcx,[rbx-28h]  
00007FF68ECF1EC5  movsxd      rdx,dword ptr [rax+4]  
00007FF68ECF1EC9  add         rcx,rdx  
00007FF68ECF1ECC  mov         rax,qword ptr [rcx]  
00007FF68ECF1ECF  add         rsp,20h  
00007FF68ECF1ED3  pop         rbx  
00007FF68ECF1ED4  jmp         qword ptr [rax]  
00007FF68ECA2914  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECF1E54h)  
00007FF68ECF1E54  movsxd      rax,dword ptr [rcx-4]  
00007FF68ECF1E58  sub         rcx,rax  
00007FF68ECF1E5B  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECA1E6Fh)  
00007FF68ECA1E6F  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECF1E70h)  
00007FF68ECF1E70  inc         dword ptr [rcx-10h]  
00007FF68ECF1E73  ret  
00007FF68ECF1F2D  mov         rax,qword ptr [rbx-40h]  
00007FF68ECF1F31  lea         rcx,[rbx-40h]  
00007FF68ECF1F35  movsxd      rdx,dword ptr [rax+4]  
00007FF68ECF1F39  add         rcx,rdx  
00007FF68ECF1F3C  mov         rax,qword ptr [rcx]  
00007FF68ECF1F3F  add         rsp,20h  
00007FF68ECF1F43  pop         rbx  
00007FF68ECF1F44  jmp         qword ptr [rax]  
00007FF68ECA2914  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECF1E54h)  
00007FF68ECF1E54  movsxd      rax,dword ptr [rcx-4]  
00007FF68ECF1E58  sub         rcx,rax  
00007FF68ECF1E5B  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECA1E6Fh)  
00007FF68ECA1E6F  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECF1E70h)  
00007FF68ECF1E70  inc         dword ptr [rcx-10h]  
00007FF68ECF1E73  ret  
00007FF68ECF1F8D  mov         rax,qword ptr [rbx+8]  
00007FF68ECF1F91  lea         rcx,[rbx+8]  
00007FF68ECF1F95  movsxd      rdx,dword ptr [rax+4]  
00007FF68ECF1F99  add         rcx,rdx  
00007FF68ECF1F9C  mov         rax,qword ptr [rcx]  
00007FF68ECF1F9F  add         rsp,20h  
00007FF68ECF1FA3  pop         rbx  
00007FF68ECF1FA4  jmp         qword ptr [rax]  
00007FF68ECA2914  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECF1E54h)  
00007FF68ECF1E54  movsxd      rax,dword ptr [rcx-4]  
00007FF68ECF1E58  sub         rcx,rax  
00007FF68ECF1E5B  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECA1E6Fh)  
00007FF68ECA1E6F  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECF1E70h)  
00007FF68ECF1E70  inc         dword ptr [rcx-10h]  
00007FF68ECF1E73  ret  
00007FF68ECF2003  mov         rax,qword ptr [rbx-18h]  
00007FF68ECF2007  lea         rcx,[rbx-18h]  
00007FF68ECF200B  movsxd      rdx,dword ptr [rax+4]  
00007FF68ECF200F  add         rcx,rdx  
00007FF68ECF2012  mov         rax,qword ptr [rcx]  
00007FF68ECF2015  add         rsp,20h  
00007FF68ECF2019  pop         rbx  
00007FF68ECF201A  jmp         qword ptr [rax]  
00007FF68ECA2914  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECF1E54h)  
00007FF68ECF1E54  movsxd      rax,dword ptr [rcx-4]  
00007FF68ECF1E58  sub         rcx,rax  
00007FF68ECF1E5B  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECA1E6Fh)  
00007FF68ECA1E6F  jmp         InterfaceTimingTests::Cz2::inc1 (07FF68ECF1E70h)  
00007FF68ECF1E70  inc         dword ptr [rcx-10h]  
00007FF68ECF1E73  ret  
00007FF68ECF0A3F  sub         rsi,1  
00007FF68ECF0A43  jne         InterfaceTimingTests::Run+916h (07FF68ECF0A36h)  

Same example using $interfaces

Consider the same example, except that we now use $interfaces


$interface+ Iw
{
    void inc1();
    void inc5();
};

$interface+ Ix : Iw
{
    void inc2();
};

$interface+ Iy : Iw
{
    void inc3();
};

$interface+ Iz : Ix, Iy
{
    void inc4();
};

// Size is 4 bytes
struct C
{
    C() : x(0) {}
    void inc1() { x += 1; }
    void inc2() { inc1(); inc1(); }
    void inc3() { inc2(); inc1(); }
    void inc4() { inc3(); inc1(); }
    void inc5() { inc4(); inc1(); }
    int x;
};

void run()
{
    C c;
    ptr<Iw> p = &c;

    // Rate is 4.4 GHz on Ryzen9 3900X Windows-x64
    for (int i=0 ; i < 1000000 ; ++i)
    {
        p->inc5();
    }
}

Note that the size of C is now 4 bytes (previously was 88 bytes).

Also since the struct C doesn't have any virtual functions or virtual inheritance, the compiler is much more likely to inline aggressively. In this case the loop executes in 1/85 of the time, and the following machine code executes in the loop:


again:  add         eax,5  
        sub         rcx,1  
        jne         again 

Generated code by Xcpp front end

The following is the generated C++ code for the interfaces Iw, Ix, Iy and Iz


struct Iw
{
    struct FnTable
    {
        void (*inc1)(void* _self);
    };
    Iw(void* _self, const FnTable* table) : m_self(_self), m_table(table) {}
    
    void AssignFrom(Iw& _rhs) { m_self = _rhs.m_self; m_table = _rhs.m_table; }
    void AssignFrom(Iw const& _rhs) const { const_cast<Iw*>(this)->m_self = _rhs.m_self; const_cast<Iw*>(this)->m_table = _rhs.m_table; }
    
    template<typename T> struct Stubs
    {
        static void inc1(void* _self) { ((T*)_self)->inc1(); }
        static const FnTable& GetTable()
        {
            static const FnTable t =
            {
                &inc1
            };
            return t;
        }
    };
    template<typename T> void CoerceFrom(T* _rhs) { m_self = _rhs; m_table = &Stubs<T>::GetTable(); }
    template<typename T> void CoerceFrom(const T* _rhs) const { const_cast<Iw*>(this)->m_self = const_cast<T*>(_rhs); const_cast<Iw*>(this)->m_table = &Stubs<T>::GetTable(); }
    
    void inc1() { m_table->inc1(m_self); }
    
    union
    {
        void* m_self;
    };
    FnTable const* m_table;
};

struct Ix
{
    struct FnTable
    {
        Iw::FnTable _Iw;
        void (*inc2)(void* _self);
    };
    Ix(void* _self, const FnTable* table) : m_self(_self), m_table(table) {}
    
    void AssignFrom(Ix& _rhs) { m_self = _rhs.m_self; m_table = _rhs.m_table; }
    void AssignFrom(Ix const& _rhs) const { const_cast<Ix*>(this)->m_self = _rhs.m_self; const_cast<Ix*>(this)->m_table = _rhs.m_table; }
    
    template<typename T> struct Stubs
    {
        static void inc2(void* _self) { ((T*)_self)->inc2(); }
        static const FnTable& GetTable()
        {
            static const FnTable t =
            {
                {
                    &Iw::Stubs<T>::inc1,
                },
                &inc2
            };
            return t;
        }
    };
    template<typename T> void CoerceFrom(T* _rhs) { m_self = _rhs; m_table = &Stubs<T>::GetTable(); }
    template<typename T> void CoerceFrom(const T* _rhs) const { const_cast<Ix*>(this)->m_self = const_cast<T*>(_rhs); const_cast<Ix*>(this)->m_table = &Stubs<T>::GetTable(); }
    
    void inc1() { m_table->_Iw.inc1(m_self); }
    void inc2() { m_table->inc2(m_self); }
    
    operator Iw() const { return Iw(m_self, &m_table->_Iw); }
    
    union
    {
        void* m_self;
    };
    FnTable const* m_table;
};

struct Iy
{
    struct FnTable
    {
        Iw::FnTable _Iw;
        void (*inc3)(void* _self);
    };
    Iy(void* _self, const FnTable* table) : m_self(_self), m_table(table) {}
    
    void AssignFrom(Iy& _rhs) { m_self = _rhs.m_self; m_table = _rhs.m_table; }
    void AssignFrom(Iy const& _rhs) const { const_cast<Iy*>(this)->m_self = _rhs.m_self; const_cast<Iy*>(this)->m_table = _rhs.m_table; }
    
    template<typename T> struct Stubs
    {
        static void inc3(void* _self) { ((T*)_self)->inc3(); }
        static const FnTable& GetTable()
        {
            static const FnTable t =
            {
                {
                    &Iw::Stubs<T>::inc1,
                },
                &inc3
            };
            return t;
        }
    };
    template<typename T> void CoerceFrom(T* _rhs) { m_self = _rhs; m_table = &Stubs<T>::GetTable(); }
    template<typename T> void CoerceFrom(const T* _rhs) const { const_cast<Iy*>(this)->m_self = const_cast<T*>(_rhs); const_cast<Iy*>(this)->m_table = &Stubs<T>::GetTable(); }
    
    void inc1() { m_table->_Iw.inc1(m_self); }
    void inc3() { m_table->inc3(m_self); }
    
    operator Iw() const { return Iw(m_self, &m_table->_Iw); }
    
    union
    {
        void* m_self;
    };
    FnTable const* m_table;
};

struct Iz
{
    struct FnTable
    {
        Ix::FnTable _Ix;
        Iy::FnTable _Iy;
        void (*inc4)(void* _self);
    };
    Iz(void* _self, const FnTable* table) : m_self(_self), m_table(table) {}
    
    void AssignFrom(Iz& _rhs) { m_self = _rhs.m_self; m_table = _rhs.m_table; }
    void AssignFrom(Iz const& _rhs) const { const_cast<Iz*>(this)->m_self = _rhs.m_self; const_cast<Iz*>(this)->m_table = _rhs.m_table; }
    
    template<typename T> struct Stubs
    {
        static void inc4(void* _self) { ((T*)_self)->inc4(); }
        static const FnTable& GetTable()
        {
            static const FnTable t =
            {
                {
                    {
                        &Iw::Stubs<T>::inc1,
                    },
                    &Ix::Stubs<T>::inc2,
                },
                {
                    {
                        &Iw::Stubs<T>::inc1,
                    },
                    &Iy::Stubs<T>::inc3,
                },
                &inc4
            };
            return t;
        }
    };
    template<typename T> void CoerceFrom(T* _rhs) { m_self = _rhs; m_table = &Stubs<T>::GetTable(); }
    template<typename T> void CoerceFrom(const T* _rhs) const { const_cast<Iz*>(this)->m_self = const_cast<T*>(_rhs); const_cast<Iz*>(this)->m_table = &Stubs<T>::GetTable(); }
    
    void inc1() { m_table->_Ix._Iw.inc1(m_self); }
    void inc2() { m_table->_Ix.inc2(m_self); }
    void inc3() { m_table->_Iy.inc3(m_self); }
    void inc4() { m_table->inc4(m_self); }
    
    operator Iw() const { return Iw(m_self, &m_table->_Ix._Iw); }
    operator Ix() const { return Ix(m_self, &m_table->_Ix); }
    operator Iy() const { return Iy(m_self, &m_table->_Iy); }
    
    union
    {
        void* m_self;
    };
    FnTable const* m_table;
};