Tony Wasserka
@fail_cluez
C++::London
19 Feb 2018
First of all: Who am I?
Game/Application
Runs on emulated CPU
|
|||||
Kernel: Horizon
API emulation
|
|||||
ARM11 CPUs
Interpreter
|
Kernel↔Application interface: 130 system calls
System calls on hardware:
// Register state exchanged through a system call: sixteen 32-bit registers,
// reg[0] .. reg[15]. Also declares the object `cpu` of this type.
struct CPUContext {
uint32_t reg[16];
} cpu;
High-level emulation:
// Register state used by the high-level emulation: sixteen 32-bit registers,
// reg[0] .. reg[15]. Also declares the object `cpu` of this type.
struct CPUContext {
uint32_t reg[16];
} cpu;
// No inputs, no outputs
void
DoExitCurrentProcess();

// Dispatches the system call identified by `index` to its high-level handler.
// ExitCurrentProcess takes nothing from the registers and writes nothing back.
void HandleSVC(CPUContext& cpu, uint32_t index) {
    switch (index) {
    // ...
    case SvcId::ExitCurrentProcess: // 0x3
        DoExitCurrentProcess();
        break;
    // ...
    }
}
// One input, one output
uint32_t
DoSleepThread(uint32_t duration);

// Dispatches the system call identified by `index`: reads the input from r0,
// calls the handler and writes the result code back to r0.
void HandleSVC(CPUContext& cpu, uint32_t index) {
    switch (index) {
    // ...
    case SvcId::SleepThread: { // 0xa
        // The braces give the locals their own scope, so the case labels that
        // follow do not jump past their initialisation.
        uint32_t duration_ns = FromRegister<uint32_t>(cpu.reg[0]);
        auto result = DoSleepThread(duration_ns);
        cpu.reg[0] = ToRegister<uint32_t>(result); // result code
        break;
    }
    // ...
    }
}
// Three inputs, two outputs
std::tuple<uint32_t, VAddr>
DoControlMemory(uint32_t size, uint32_t perm, uint32_t mode);

// Dispatches the system call identified by `index`: reads the inputs from
// r1..r3, calls the handler and writes the two outputs back to r0 and r1.
void HandleSVC(CPUContext& cpu, uint32_t index) {
    switch (index) {
    // ...
    case SvcId::ControlMemory: { // 0x1
        // The braces give the locals their own scope, so the case labels that
        // follow do not jump past their initialisation.
        uint32_t size = FromRegister<uint32_t>(cpu.reg[1]);
        uint32_t mode = FromRegister<uint32_t>(cpu.reg[2]);
        uint32_t permissions = FromRegister<uint32_t>(cpu.reg[3]);
        auto outputs = DoControlMemory(size, permissions, mode);
        cpu.reg[0] = ToRegister<uint32_t>(std::get<0>(outputs)); // Error code
        cpu.reg[1] = ToRegister<VAddr >(std::get<1>(outputs)); // Block address
        break;
    }
    // ...
    }
}
// Three inputs, two outputs
std::tuple<uint32_t, VAddr>
DoControlMemory(uint32_t size, uint32_t perm, uint32_t mode);

// Dispatches the system call identified by `index`: reads the inputs from
// r1..r3, calls the handler and writes the two outputs back to r0 and r1.
void HandleSVC(CPUContext& cpu, uint32_t index) {
    switch (index) {
    // ...
    case SvcId::ControlMemory: { // 0x1
        // The braces give the locals their own scope, so the case labels that
        // follow do not jump past their initialisation.
        uint32_t size = FromRegister<uint32_t>(cpu.reg[1]); // Starts at r1??
        uint32_t mode = FromRegister<uint32_t>(cpu.reg[2]);
        uint32_t permissions = FromRegister<uint32_t>(cpu.reg[3]);
        auto outputs = DoControlMemory(size, permissions, mode);
        cpu.reg[0] = ToRegister<uint32_t>(std::get<0>(outputs)); // Error code
        cpu.reg[1] = ToRegister<VAddr >(std::get<1>(outputs)); // Block address
        break;
    }
    // ...
    }
}
// No inputs, two outputs
std::tuple<Result, Mutex*>
DoCreateMutex();

// Dispatches the system call identified by `index`: calls the handler and
// writes its two outputs back to r0 and r1.
void HandleSVC(CPUContext& cpu, uint32_t index) {
    switch (index) {
    // ...
    case SvcId::CreateMutex: { // 0x13
        // The braces give `outputs` its own scope, so the case labels that
        // follow do not jump past its initialisation.
        auto outputs = DoCreateMutex();
        cpu.reg[0] = ToRegister<uint32_t>(std::get<0>(outputs)); // Error code
        cpu.reg[1] = ToRegister<Mutex* >(std::get<1>(outputs)); // Handle of created mutex
        break;
    }
    // ...
    }
}
// Two inputs, one output
Result
DoLockMutex(Mutex* mutex, uint32_t timeout);

// Dispatches the system call identified by `index`: reads the inputs from
// r0 and r1, calls the handler and writes its single output back to r0.
void HandleSVC(CPUContext& cpu, uint32_t index) {
    switch (index) {
    // ...
    case SvcId::LockMutex: { // 0x24
        // The braces give the locals their own scope, so the case labels that
        // follow do not jump past their initialisation.
        Mutex* mutex = FromRegister<Mutex* >(cpu.reg[0]);
        uint32_t timeout = FromRegister<uint32_t>(cpu.reg[1]);
        auto output = DoLockMutex(mutex, timeout);
        cpu.reg[0] = ToRegister<uint32_t>(output);
        break;
    }
    // ...
    }
}
Don't Repeat Yourself
But how?
Building blocks:
// Three inputs, two outputs
std::tuple<uint32_t, DmaTracker*>
DoStartDma(Process* dst_process, VAddr dst_addr,
Process* src_proces, VAddr src_addr, uint32_t size);
void HandleSVC(CPUContext& cpu, uint32_t index) {
switch (index) {
// ...
case SvcId::StartDma: // 0x1
Process* dst_proc = FromRegister<Process*>(cpu.reg[1]);
VAddr dst_addr = FromRegister<VAddr >(cpu.reg[2]);
Process* src_proc = FromRegister<Process*>(cpu.reg[3]);
VAddr src_addr = FromRegister<VAddr >(cpu.reg[0]);
uint32_t data_size = FromRegister<uint32_t>(cpu.reg[4]);
auto outputs = DoStartDma(dst_proc, dst_addr, src_proc, src_addr, data_size);
cpu.reg[0] = ToRegister<uint32_t >(std::get<0>(outputs)); // Error code
cpu.reg[1] = ToRegister<DmaObject*>(std::get<1>(outputs)); // DMA object
break;
// ...
}
}
std::tuple<Result, DmaObject*>
DoStartDma(Process* dst_process, VAddr dst_data_addr,
Process* src_process, VAddr src_data_addr,
uint32_t data_size);
↓ Function Traits ↓
using InData = std::tuple<Process*, VAddr, Process*, VAddr, uint32_t>
using OutData = std::tuple<Result, DmaObject*>
↓ Generators ↓
template<typename InData> InData Decode(CPU& cpu)
template<typename OutData> void Encode(CPU& cpu, OutData... data)
↓ Combine ↓
case SvcId::StartDma:
WrapSVCImpl<DoStartDma>(cpu);
break;
std::tuple<uint32_t, DmaTracker*>
DoStartDma(Process* dst_process, VAddr dst_addr,
Process* src_process, VAddr src_addr, uint32_t size)
void HandleSVC(CPUContext& cpu, uint32_t index) {
switch (index) {
// ...
case SvcId::StartDma:
WrapSVCImpl<DoStartDma>(cpu);
break;
// ...
}
}
// Applies `func` to a single value and returns the outcome as a T.
template<typename F, typename T>
auto TransformOne(F func, T value) -> T {
    return func(value);
}
// Applies `func` to each of the two values and bundles both outcomes in a
// tuple, keeping the element types of the inputs.
template<typename F, typename T1, typename T2>
auto TransformTwo(F func, T1 value1, T2 value2) -> std::tuple<T1, T2> {
    using Transformed = std::tuple<T1, T2>;
    // Braced initialisation evaluates func(value1) before func(value2).
    return Transformed { func(value1), func(value2) };
}
Can take any number of arguments
Implementation using parameter pack expansion
template<typename F, typename... T>
std::tuple<T...> TransformMany(F func, T... values) {
}
// Applies `func` to every value of the pack and bundles the outcomes in a
// tuple whose element types match the input types.
template<typename F, typename... T>
auto TransformMany(F func, T... values) -> std::tuple<T...> {
    using Transformed = std::tuple<T...>;
    // The pack expansion produces one func(...) call per value; braced
    // initialisation evaluates them from left to right.
    return Transformed { func(values) ... };
}
TransformMany<T1> == TransformOne<T1>
TransformMany<T1, T2> == TransformTwo<T1, T2>
// Prints the number and the string on two labelled lines of standard output,
// flushing the stream after each line.
void PrintNumberAndString(int num, const char* str) {
    std::cout << "Number: " << num << std::endl
              << "String: " << str << std::endl;
}
// std::apply unpacks the tuple's elements into the function's parameters.
auto tup = std::make_tuple(5, "Hello World");
std::apply(PrintNumberAndString, tup);
Number: 5 String: Hello World
// First attempt at a generic decoder: intended to read one register per
// requested type, starting at r0.
// NOTE: `T` is still an unexpanded parameter pack in the return statement, so
// this version does not compile as written.
template<typename... T>
std::tuple<T...> Decode(CPUContext& cpu) {
unsigned reg_index = 0;
return FromRegister<T>(cpu.reg[reg_index++]) ;
}
case SvcId::StartDma:
// Decode all five inputs in one go, then unpack them into the handler.
auto inputs = Decode<Process*,VAddr,Process*,VAddr,uint32_t>(cpu);
auto outputs = std::apply(DoStartDma, inputs);
// Decodes one value per type in T... from consecutive registers, starting at
// r0, and returns them as a tuple.
template<typename... T>
std::tuple<T...> Decode(CPUContext& cpu) {
    using Decoded = std::tuple<T...>;
    unsigned next_reg = 0;
    // Braced initialisation evaluates the expanded calls from left to right,
    // so each pack element consumes the next register in order.
    return Decoded { FromRegister<T>(cpu.reg[next_reg++]) ... };
}
case SvcId::StartDma:
// Decode all five inputs in one go, then unpack them into the handler.
auto inputs = Decode<Process*,VAddr,Process*,VAddr,uint32_t>(cpu);
auto outputs = std::apply(DoStartDma, inputs);
case SvcId::StartDma:
// Generic version: one Decode call replaces the per-register boilerplate.
auto inputs = Decode<Process*,VAddr,Process*,VAddr,uint32_t>(cpu);
auto outputs = std::apply(DoStartDma, inputs);
vs.
// Hand-written version: every input is decoded from its register explicitly.
// The inputs are read from r1, r2, r3, r0 and r4, in that order.
case SvcId::StartDma:
auto dst_proc = FromRegister<Process*>(cpu.reg[1]);
auto dst_addr = FromRegister<VAddr >(cpu.reg[2]);
auto src_proc = FromRegister<Process*>(cpu.reg[3]);
auto src_addr = FromRegister<VAddr >(cpu.reg[0]);
auto data_size = FromRegister<uint32_t>(cpu.reg[4]);
DoStartDma(dst_proc, dst_addr, src_proc, src_addr, data_size);
Pack expansion while applying an operator
Syntax:
// Applies `func` to every value of the pack and adds up the outcomes.
// Requires at least one value: a fold over `+` has no result for an empty pack.
template<typename F, typename... T>
auto AccumulateMany(F func, T... values) {
    // Unary right fold: func(v1) + (func(v2) + (... + func(vN)))
    auto total = (func(values) + ... );
    return total;
}
AccumulateMany(twice, 5) == twice(5)
AccumulateMany(twice, 5, 64) == twice(5)+twice(64)
cpu.reg[reg_index++] = ToRegister<T>(t)
// Encodes each value with ToRegister and stores the results in consecutive
// registers, starting at r0.
// NOTE(review): `cpu` is not a parameter of this function; it presumably
// refers to an object declared in an enclosing scope - confirm.
template<typename... T>
void Encode(const T&... values) {
unsigned reg_index = 0;
// Fold over the comma operator: one assignment per value, left to right.
((cpu.reg[reg_index++] = ToRegister<T>(values)), ...);
}
auto outputs = std::apply(DoStartDma, inputs);
std::apply(Encode<uint32_t,DmaObject*>, outputs);
// Type trait sketch: maps a type to a compile-time boolean via template
// specialisation. Only the two types listed below are covered.
template<typename T>
struct is_pointer;

// Explicit specialisation for a pointer type
template<>
struct is_pointer<int*> {
    static constexpr bool value = true;
};

// Explicit specialisation for a non-pointer type
template<>
struct is_pointer<int> {
    static constexpr bool value = false;
};
template<typename F>
struct FunctionTraits {
using Args = std::tuple< /* Parameter list of F */ >;
using Result = /* Return type of F */;
};
Implementations available in
Minimal implementation for our use case
// Exposes the parameter list (as a std::tuple) and the return type of a
// function type. Only the forms specialised below are supported.
template<typename F>
struct FunctionTraits;

// Specialisation for plain function types: FuncResult(FuncArgs...)
template<typename FuncResult, typename... FuncArgs>
struct FunctionTraits<FuncResult(FuncArgs...)> {
    using Args = std::tuple<FuncArgs...>;
    using Result = FuncResult;
};

// Specialisation for function pointers, e.g. the type of an `auto` template
// parameter initialised with a function name; reuses the definitions above.
template<typename FuncResult, typename... FuncArgs>
struct FunctionTraits<FuncResult(*)(FuncArgs...)>
    : FunctionTraits<FuncResult(FuncArgs...)> {};
Very limited, but good enough here
std::tuple<Result, DmaObject*>
DoStartDma(Process* dst_process, VAddr dst_data_addr,
Process* src_process, VAddr src_data_addr,
uint32_t data_size);
↓ Function Traits ↓
using InData = FunctionTraits<decltype(DoStartDma)>::Args;
using OutData = FunctionTraits<decltype(DoStartDma)>::Result;
↓ Generators ↓
InData Decode<InData>(CPU& cpu)
void Encode<OutData>(CPU& cpu, OutData... data)
↓ Combine ↓
case SvcId::StartDma:
WrapSVCImpl<DoStartDma>(cpu);
break;
// Minimal CPU model for the example: sixteen 32-bit registers,
// reg[0] .. reg[15].
struct CPU {
uint32_t reg[16];
};
// Dummy kernel object types
struct Process {};
struct DmaObject {};
// Address type used by the handlers; a 32-bit unsigned value.
using VAddr = uint32_t;
// Dummy StartDma handler for the example: logs its five inputs and returns
// two placeholder outputs (a value derived from the addresses and a freshly
// allocated DmaObject).
// NOTE: the DmaObject is created with `new`; nothing in this block frees it.
std::tuple<uint32_t, DmaObject*>
DoStartDma(Process* dst_process, VAddr dst_addr,
           Process* src_process, VAddr src_addr, uint32_t data_size) {
    std::cout << "DoStartDma: dst_proc=" << dst_process
              << ", dst_addr=" << dst_addr
              << ", src_proc=" << src_process
              << ", src_addr=" << src_addr
              << ", size=" << data_size << std::endl;
    // Return some unique-ish dummy values
    return std::tuple(uint32_t{src_addr * dst_addr}, new DmaObject);
}
// Converts a raw 32-bit register value into a T and logs the raw value.
// Pointer types are fabricated from the value; everything else is converted
// with a static_cast.
template<typename T>
T FromRegister(uint32_t value) {
    std::cout << "Decoding value " << value << '\n';
    if constexpr (std::is_pointer_v<T>) {
        // Just hack together some pointer based on the given value
        const auto address = static_cast<uintptr_t>(value);
        return reinterpret_cast<T>(address);
    } else {
        return static_cast<T>(value);
    }
}
// Converts a value into its raw 32-bit register representation and logs the
// value. Pointers are encoded as their object address, narrowed to 32 bits;
// everything else is converted with a static_cast.
template<typename T>
uint32_t ToRegister(T value) {
    std::cout << "Encoding value " << value << '\n';
    if constexpr (std::is_pointer_v<T>) {
        // Just return the raw object address as a value
        const auto address = reinterpret_cast<uintptr_t>(value);
        return static_cast<uint32_t>(address);
    } else {
        return static_cast<uint32_t>(value);
    }
}
Decode requires a parameter pack:
template<typename... Args>
std::tuple<Args...> Decode(CPU& cpu) { /* ... */ }
… but FunctionTraits gave us a std::tuple
Let's try partial template specialisation:
template<typename ArgsTuple>
ArgsTuple Decode(CPU&);
template<typename... Args>
std::tuple<Args...> Decode<std::tuple<Args...>>(CPU&) {
// ...
}
error: function template partial specialization is not allowed
We can't partially specialise functions…
… but we can partially specialise function objects!
// Function object that decodes a tuple of arguments from CPU registers.
// Only the std::tuple specialisation below is defined.
template<typename ArgsTuple>
struct Decoder;

template<typename... Args>
struct Decoder<std::tuple<Args...>> {
    // Reads one value per type in Args... from consecutive registers,
    // starting at r0.
    static std::tuple<Args...> Decode(CPU& cpu) {
        using Decoded = std::tuple<Args...>;
        size_t next_reg = 0;
        // Braced initialisation evaluates the expanded calls left to right,
        // so each argument consumes the next register in order.
        return Decoded { FromRegister<Args>(cpu.reg[next_reg++]) ... };
    }
};
Similarly, the Encoder must be a function object too:
// Function object that encodes a tuple of results into CPU registers.
// Only the std::tuple specialisation below is defined.
template<typename ArgsTuple>
struct Encoder;

template<typename... Args>
struct Encoder<std::tuple<Args...>> {
    // Register file that operator() writes to.
    CPU& cpu;

    // Encodes each argument with ToRegister and stores the results in
    // consecutive registers, starting at r0.
    void operator()(Args... args) {
        size_t next_reg = 0;
        auto store = [&](uint32_t encoded) { cpu.reg[next_reg++] = encoded; };
        // Fold over the comma operator: one store per argument, left to right.
        (store(ToRegister<Args>(args)), ...);
    }
};
Encoder<OutArgs> encoder{cpu}; // construct stateful function object
std::apply(encoder, outputs);
// Generic system-call wrapper: decodes the handler's inputs from the CPU
// registers, calls the handler and encodes its outputs back into registers.
// SVCImpl must be a free function whose return type is a std::tuple.
template<auto SVCImpl>
void WrapSVCImpl(CPU& cpu) {
    // An `auto` template parameter bound to a function has function-pointer
    // type; strip the pointer to obtain the plain function signature.
    using Signature = std::remove_pointer_t<decltype(SVCImpl)>;

    // Turn handler's function signature into std::tuples
    using InArgs = typename FunctionTraits<Signature>::Args;
    using OutArgs = typename FunctionTraits<Signature>::Result;

    // Decode registers by calling stateless function object
    auto args = Decoder<InArgs>::Decode(cpu);

    // Call handler
    auto result = std::apply(SVCImpl, args);

    // Encode registers by calling stateful function object
    Encoder<OutArgs> encoder{cpu};
    std::apply(encoder, result);
}
// Dispatches the system call identified by `index` to its handler. Each case
// only names the handler; WrapSVCImpl takes care of moving values between the
// CPU registers and the handler's parameters and results.
// Indices without a matching case are ignored.
void HandleSVC(CPU& cpu, uint32_t index) {
switch (index) {
// ...
case 0x55:
WrapSVCImpl<DoStartDma>(cpu);
break;
case 0x56:
WrapSVCImpl<DoStopDma>(cpu);
break;
case 0x57:
WrapSVCImpl<GetDmaState>(cpu);
break;
case 0x58:
WrapSVCImpl<DoRestartDma>(cpu);
break;
// ...
}
}
// Demo driver: fills the first ten registers with 0..9, issues system call
// 0x55 and prints every register in hexadecimal afterwards.
int main() {
    CPU cpu {{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }};
    HandleSVC(cpu, 0x55);

    std::cout << '\n';
    std::cout << "CPU registers on return:" << '\n';
    // std::hex stays in effect once set, so a single insertion is enough.
    std::cout << std::hex;
    for (const auto value : cpu.reg) {
        std::cout << "0x" << value << ' ';
    }
}
g++ -std=c++1z example.cpp
// Demo driver: fills the first ten registers with 0..9, issues system call
// 0x55 and prints every register in hexadecimal afterwards.
int main() {
    CPU cpu {{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }};
    HandleSVC(cpu, 0x55);

    std::cout << '\n';
    std::cout << "CPU registers on return:" << '\n';
    // std::hex stays in effect once set, so a single insertion is enough.
    std::cout << std::hex;
    for (const auto value : cpu.reg) {
        std::cout << "0x" << value << ' ';
    }
}
Decoding value 0
Decoding value 1
Decoding value 2
Decoding value 3
Decoding value 4
DoStartDma: dst_proc=0x0, dst_addr=1, src_proc=0x2, src_addr=3, size=4 returning 3 and 0x240b280
Encoding value 3 = 1 * 3
Encoding value 0x25ea280

CPU registers on return:
0x3 0x25ea280 0x2 0x3 0x4 0x5 0x6 0x7 0x8 0x9 0x0 0x0 0x0 0x0 0x0 0x0
The best code is the code you didn't have to write!
Introduction to template metaprogramming:
Arthur O'Dwyer, Template Normal Programming
Fold expressions in detail:
Vittorio Romeo, Introduction to C++ origami
Decode() was cheating before:
size_t reg_index = 0;
return std::tuple<Args... > {
FromRegister<Args>(cpu.reg[reg_index++]) ...
};
This doesn't reflect the proper register order:
r1 → r2 → r3 → r0 → r4
Luckily, we can just replace reg_index++ with
template<size_t first_reg>
size_t GetAndThenUpdateRegIndex(size_t& cur_index);
Ad-hoc logic to reproduce the 3DS register order:
// Returns the current register index and advances `cur_index` to the next
// one in the sequence first_reg, ..., 3, 0, ..., first_reg - 1, 4, 5, ...
// (for first_reg == 1 this yields r1, r2, r3, r0, r4, ...).
template<size_t first_reg>
size_t GetAndThenUpdateRegIndex(size_t& cur_index) {
    const size_t current = cur_index;

    size_t next = current + 1;
    if (next == 4) {
        // Past r3: wrap around to r0.
        next = 0;
    }
    if (next == first_reg) {
        // Back at the starting register: r0..r3 are used up, continue at r4.
        next = 4;
    }

    cur_index = next;
    return current;
}
constexpr size_t first_reg = std::tuple_size_v<OutData> - 1;
// Decodes one value per type in Args... from the CPU registers, visiting the
// registers in the order produced by GetAndThenUpdateRegIndex. The first
// register read is (number of elements in OutData) - 1.
// NOTE(review): an empty OutData tuple would make first_reg wrap around -
// confirm that handlers always have at least one output.
template<typename OutData, typename... Args>
static std::tuple<Args...> Decode(CPU& cpu) {
    using Decoded = std::tuple<Args...>;
    constexpr size_t first_reg = std::tuple_size_v<OutData> - 1;
    size_t next_reg = first_reg;
    // Braced initialisation evaluates the expanded calls left to right, so
    // the register sequence advances once per argument, in argument order.
    return Decoded {
        FromRegister<Args>(
            cpu.reg[GetAndThenUpdateRegIndex<first_reg>(next_reg)])
        ...
    };
}
Decoding value 1
Decoding value 2
Decoding value 3
Decoding value 0
Decoding value 4
DoStartDma: dst_proc=0x1, dst_addr=2, src_proc=0x3, src_addr=0, size=4 returning 0 and 0x240b280
Encoding value 0 = 2 * 0
Encoding value 0x25ea280

CPU registers on return:
0x0 0x25ea280 0x2 0x3 0x4 0x5 0x6 0x7 0x8 0x9 0x0 0x0 0x0 0x0 0x0 0x0