From c3eef051e45712e406695290792b035f1a95a25b Mon Sep 17 00:00:00 2001 From: TrevorSundberg Date: Sun, 16 Feb 2020 21:41:25 -0800 Subject: [PATCH 1/3] Remill running on the web via Emscripten --- .gitignore | 3 + CMakeLists.txt | 27 ++++++- cmake/ccache.cmake | 3 +- remill/OS/OS.h | 2 +- web/README.md | 102 ++++++++++++++++++++++++++ web/build.sh | 6 ++ web/generate.sh | 7 ++ web/src/Dockerfile | 166 ++++++++++++++++++++++++++++++++++++++++++ web/src/FindXED.cmake | 12 +++ web/src/index.html | 58 +++++++++++++++ web/src/index.js | 12 +++ web/src/run.sh | 12 +++ 12 files changed, 406 insertions(+), 4 deletions(-) create mode 100644 web/README.md create mode 100755 web/build.sh create mode 100755 web/generate.sh create mode 100644 web/src/Dockerfile create mode 100644 web/src/FindXED.cmake create mode 100644 web/src/index.html create mode 100755 web/src/index.js create mode 100755 web/src/run.sh diff --git a/.gitignore b/.gitignore index 249217a83..2c9c247d6 100644 --- a/.gitignore +++ b/.gitignore @@ -94,3 +94,6 @@ obj-intel64/* # Lifted binaries *.lifted + +# Web build +web/build \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 31125c5b4..ae7c30b93 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,6 +20,29 @@ endif () project(remill) cmake_minimum_required(VERSION 3.2) +if(EMSCRIPTEN) + set(CMAKE_BUILD_TYPE Release) + set(gflags_DIR "$ENV{REPOS}/gflags/build") + set(glog_DIR "$ENV{REPOS}/glog/build") + set(LLVM_DIR "$ENV{REPOS}/llvm-project/llvm/build/lib/cmake/llvm/") + set(remill_DIR "$ENV{REPOS}/remill/build/") + + # We want to manually invoke main for Lift instead of Emscripten calling it for us + # So we disable INVOKE_RUN, export callMain (and FS for files). 
+ set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/web/src") + set(CMAKE_CXX_FLAGS "\ + ${CMAKE_CXX_FLAGS} \ + $ENV{EM_CXX_FLAGS} \ + -D__i386__ \ + -DADDRESS_SIZE_BITS=32 \ + -s ALLOW_MEMORY_GROWTH=1 \ + --no-heap-copy \ + -s INVOKE_RUN=0 \ + -s EXTRA_EXPORTED_RUNTIME_METHODS='[\"callMain\",\"FS\"]' \ + -s ASSERTIONS=1 \ + --embed-file ${CMAKE_CURRENT_SOURCE_DIR}/web/build/remill/Arch/X86/Runtime/x86.bc@/share/remill/10.0/semantics/x86.bc") +endif() + include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/settings.cmake") include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/utils.cmake") include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/BCCompiler.cmake") @@ -227,7 +250,7 @@ if("${LIBRARY_REPOSITORY_ROOT}" STREQUAL "" OR NOT EXISTS "${LIBRARY_REPOSITORY_ message("InstallExternalTarget: Found llvm-link executable: ${llvmlink_location}") find_library("libllvm_location" "${dynamic_lib_prefix}LLVM-${REMILL_LLVM_VERSION}.${dynamic_lib_extension}") - if("${libllvm_location}" STREQUAL "libllvm_location-NOTFOUND") + if("${libllvm_location}" STREQUAL "libllvm_location-NOTFOUND" AND NOT EMSCRIPTEN) message(FATAL_ERROR "InstallExternalTarget: Failed to locate the libLLVM dynamic library") endif() @@ -313,7 +336,7 @@ add_subdirectory(remill/Arch/AArch64/Runtime) add_subdirectory(tools) # tests -if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "AppleClang") +if ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "AppleClang" AND NOT EMSCRIPTEN) add_custom_target(test_dependencies) if(NOT "${PLATFORM_NAME}" STREQUAL "windows") diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake index 9d0274c0d..e4a87165e 100644 --- a/cmake/ccache.cmake +++ b/cmake/ccache.cmake @@ -16,7 +16,8 @@ cmake_minimum_required(VERSION 3.4) macro(configureCcache) if(NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Linux" AND - NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") + NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin" AND + NOT EMSCRIPTEN) message(STATUS "ccache: Not supported") 
diff --git a/remill/OS/OS.h b/remill/OS/OS.h index b59bc9248..8f3da6444 100644 --- a/remill/OS/OS.h +++ b/remill/OS/OS.h @@ -24,7 +24,7 @@ # define REMILL_ON_LINUX 0 # define REMILL_ON_WINDOWS 0 # define REMILL_OS "macos" -# elif defined(__linux__) +# elif defined(__linux__) || defined(EMSCRIPTEN) # define REMILL_ON_MACOS 0 # define REMILL_ON_LINUX 1 # define REMILL_ON_WINDOWS 0 diff --git a/web/README.md b/web/README.md new file mode 100644 index 000000000..8ad8fec38 --- /dev/null +++ b/web/README.md @@ -0,0 +1,102 @@ +# Building +The only tool you need is docker to build the web version: +all tools and repositories are installed/checked out within the container. +For example, ccache is installed within the container and is already set up. + +```bash +# Generate the cmake/ninja build files +./web/generate.sh +# Build them into js/wasm files +./web/build.sh +``` + +# Using Lift +To see the web demo host an http server in the directory `web/build/tools/lift`. +The `index.html` has an example of how to execute it. + +To run it under node: +```bash +./web/build/tools/lift/index.js --bytes=90 --ir_out=out.ll +``` + +# Using the library +You can link against `web/build/libremill.a` which contains wasm binaries. + +# Debugging CMake +It is useful to specify `-DCMAKE_VERBOSE_MAKEFILE=ON` in any of the +generator calls to cmake inside the `Dockerfile` to see the exact +commands being passed to Emscripten's compiler and linker. + +# Issues + +### Warnings +There are several warnings due to conversions between signed/unsigned, pointer sizes, etc. +To enable all warnings again remove `-Wno-everything` in the `Dockerfile`. + +There are also warnings with CMake mostly related to the use of deprecated functions or that +Emscripten does not support dynamic linking (dll/so) and therefore reverts to static linkage. +To enable all these warnings remove all occurrences of `-Wno-deprecated` and `-Wno-dev`.
+ +### Debug +Right now all the libraries are built in Release because in debug some libraries +such as LLVM end up specifying specific debug formats like dwarf and this breaks Emscripten. +This most likely can be fixed with a patch/sed to the LLVM cmake files. + +### 64 Bit +We're only doing 32 bit x86 right now because wasm64 support is still in the works. +Moreover, remill does not support targeting x64 from x86 (see `CMAKE_SIZEOF_VOID_P`). +We would also need to change the defines `-D__i386__` and `-DADDRESS_SIZE_BITS=32` in the `CMakeLists.txt`. + +Because remill needs to load the semantic files, we embed `x86.bc` into the generated JavaScript. +Alternatively, instead of using `--embed-file` we could use `--preload-file` which is more efficient, +but does not work directly in NodeJS without polyfills: + +``` +--embed-file ${CMAKE_CURRENT_SOURCE_DIR}/web/build/remill/Arch/X86/Runtime/x86.bc@/share/remill/10.0/semantics/x86.bc +``` + +### Undefined symbols in LLVM +Currently we use `-s ERROR_ON_UNDEFINED_SYMBOLS=0` to avoid the following errors, but patches/sed would be better. +``` +warning: undefined symbol: __deregister_frame +warning: undefined symbol: __register_frame +warning: undefined symbol: posix_spawn_file_actions_adddup2 +warning: undefined symbol: posix_spawn_file_actions_addopen +warning: undefined symbol: posix_spawn_file_actions_destroy +warning: undefined symbol: posix_spawn_file_actions_init +``` + +### Undefined symbols in remill +The most notable undefined symbol in remill is `popen`. +Emscripten does not have an implementation for `popen` as there is no process model. +The other errors are most likely from linking LLVM.
+``` +warning: undefined symbol: popen +warning: undefined symbol: posix_spawn_file_actions_adddup2 +warning: undefined symbol: posix_spawn_file_actions_addopen +warning: undefined symbol: posix_spawn_file_actions_destroy +warning: undefined symbol: posix_spawn_file_actions_init +``` + +### Unrolling loops +Compiling under Emscripten fails with `_Pragma("unroll")` and produces the warning: +``` +remill/Arch/X86/Semantics/SSE.cpp:937:9: warning: loop not unrolled: +the optimizer was unable to perform the requested transformation; +the transformation might be disabled or specified as part of an unsupported transformation ordering +[-Wpass-failed=transform-warning] +``` + +Another interesting note is that this pragma is most likely also embedded into the llvm +bitcode files because a similar warning is reported at runtime in wasm when the bitcode files are loaded: +``` +remill-lift-10.0.js:6361 warning: :0:0: loop not unrolled: +the optimizer was unable to perform the requested transformation; +the transformation might be disabled or specified as part of an unsupported transformation ordering +``` + +### Calling main once in Lift +Emscripten supports tearing down the state after main is called (global destructors, etc.) +by using `-s EXIT_RUNTIME=1` however it does not support calling main a second time. +A workaround is to expose a function that can be invoked more than once that is not `main` or +`callMain`. This can be done easily with Embind and passing `--bind` in `CMakeLists.txt`. \ No newline at end of file diff --git a/web/build.sh b/web/build.sh new file mode 100755 index 000000000..1ece82461 --- /dev/null +++ b/web/build.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash +set -e +cd `dirname "$0"`/build +../src/run.sh cmake --build . 
+cp -f ../src/index.html ./tools/lift +cp -f ../src/index.js ./tools/lift \ No newline at end of file diff --git a/web/generate.sh b/web/generate.sh new file mode 100755 index 000000000..0383d6785 --- /dev/null +++ b/web/generate.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +set -e +cd `dirname "$0"` +rm -rf build +mkdir -p build +cd build +../src/run.sh bash -c 'cmake -Wno-dev -GNinja -DCMAKE_TOOLCHAIN_FILE="$EM_TOOLCHAIN" ../..' \ No newline at end of file diff --git a/web/src/Dockerfile b/web/src/Dockerfile new file mode 100644 index 000000000..15bab764f --- /dev/null +++ b/web/src/Dockerfile @@ -0,0 +1,166 @@ +FROM ubuntu:19.10@sha256:bd5f4f235eb31768b2c5caf1988bbdc182d4fc3cb6ee4aca6c6d74613f256140 + +RUN apt-get update +RUN apt-get install -y \ + git + +RUN git config --global advice.detachedHead false + +# Checkout all the releases at specific tags to ensure updates don't break the build. +ENV REPOS="/repos" +WORKDIR "$REPOS" +RUN git clone https://github.com/llvm/llvm-project.git +RUN cd llvm-project && git checkout llvmorg-10.0.0-rc2 +RUN git clone https://github.com/juj/emsdk.git +RUN cd emsdk && git checkout 1458145cf4f3db0fb548343e6acab267eef8e4ef +RUN git clone https://github.com/intelxed/xed.git +RUN cd xed && git checkout 11.0.1 +RUN git clone https://github.com/intelxed/mbuild.git +RUN cd mbuild && git checkout 1e57534e2122a39382c68e134026b15a3370e5b1 +RUN git clone https://github.com/gflags/gflags.git +RUN cd gflags && git checkout v2.2.2 +RUN git clone https://github.com/google/glog.git +RUN cd glog && git checkout v0.4.0 + +RUN apt-get install -y \ + build-essential \ + ccache \ + clang \ + cmake \ + curl \ + default-jre \ + emscripten \ + libtinfo-dev \ + llvm \ + lsb-release \ + ninja-build \ + patchelf \ + python2.7 \ + wget \ + zlib1g-dev + +# Download a specific version of Emscripten (fastcomp fails with linker errors, upstream is required). 
+ENV EMSCRIPTEN_VERSION sdk-tag-1.39.7-64bit-upstream +RUN cd emsdk && \ + ./emsdk update-tags && \ + ./emsdk install $EMSCRIPTEN_VERSION && \ + ./emsdk activate --embedded $EMSCRIPTEN_VERSION + +ENV EMSDK="$REPOS/emsdk" +ENV EMSDK_NODE_BIN="$EMSDK/node/12.9.1_64bit/bin" +ENV EMSCRIPTEN="$EMSDK/upstream/emscripten" +ENV PATH="$EMSDK:$EMSCRIPTEN:$EMSDK_NODE_BIN:${PATH}" +ENV EM_CONFIG="$EMSDK/.emscripten" +ENV EM_PORTS="$EMSDK/.emscripten_ports" +ENV EM_CACHE="$EMSDK/.emscripten_cache" +ENV EMSDK_NODE="$EMSDK_NODE_BIN/node" +ENV EM_TOOLCHAIN="$EMSCRIPTEN/cmake/Modules/Platform/Emscripten.cmake" +ENV EMCC_WASM_BACKEND=1 +ENV EMCC_SKIP_SANITY_CHECK=1 + +ENV EM_CXX_FLAGS="-s ERROR_ON_UNDEFINED_SYMBOLS=0 -s USE_PTHREADS=0 -s WASM=1 -Wno-everything -O2 --closure 1 --llvm-lto 3" + +# ===== Build llvm-tblgen on Host ===== +# Building all of LLVM requires its own tool 'llvm-tblgen', however we can't use the Ubuntu package "llvm" +# because it is too old and doesn't have the latest wasm changes, so build a host version ourselves. +# Moreover, when building LLVM under Emscripten it will try and build tblgen but won't be able to run it +# since it's a .js file, not an executable. Therefore below we manually specify -DLLVM_TABLEGEN=... +RUN mkdir -p "$REPOS/llvm-project/build" +WORKDIR "$REPOS/llvm-project/build" +RUN cmake \ + -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + ../llvm +RUN cmake --build . --target llvm-tblgen + +# ===== Build LLVM on Emscripten ===== +WORKDIR "$REPOS" +# LLVM attempts to check the compiler version but Emscripten's emcc outputs its version +# in a different format that can't be parsed by LLVM (a PR could be submitted upstream).
+RUN echo "" > llvm-project/llvm/cmake/modules/CheckCompilerVersion.cmake + +RUN mkdir -p "$REPOS/llvm-project/llvm/build" +WORKDIR "$REPOS/llvm-project/llvm/build" + +RUN cmake \ + -Wno-deprecated \ + -Wno-dev \ + -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + -DLLVM_DEFAULT_TARGET_TRIPLE=wasm32-unknown-unknown-wasm \ + -DLLVM_ENABLE_THREADS=OFF \ + -DLLVM_USE_SANITIZER=OFF \ + -DLLVM_ENABLE_EXPENSIVE_CHECKS=OFF \ + -DLLVM_ENABLE_BACKTRACES=OFF \ + -DLLVM_ENABLE_DUMP=OFF \ + -DLLVM_INCLUDE_TESTS=OFF \ + -DLLVM_INCLUDE_TOOLS=ON \ + -DLLVM_BUILD_TOOLS=ON \ + -DLLVM_BUILD_LLVM_DYLIB=ON \ + -DLLVM_LINK_LLVM_DYLIB=ON \ + -DLLVM_ENABLE_TERMINFO=OFF \ + -DLLVM_TARGETS_TO_BUILD= \ + -DLLVM_EXPERIMENTAL_TARGETS_TO_BUILD=WebAssembly \ + -DLLVM_TABLEGEN=$REPOS/llvm-project/build/bin/llvm-tblgen \ + -DCMAKE_TOOLCHAIN_FILE="$EM_TOOLCHAIN" \ + -DCMAKE_CXX_FLAGS="$EM_CXX_FLAGS" \ + -DCMAKE_STRIP=llvm-strip \ + -DCMAKE_AR="$EMSCRIPTEN/emar" \ + .. + +RUN cmake --build . --target LLVM + +# ===== Build XED on Emscripten ===== +WORKDIR "$REPOS/xed" +RUN ./mfile.py \ + --cc=emcc \ + --cxx=em++ \ + --linker=wasm-ld \ + --ar=emar \ + --host-cpu=x86 \ + --extra-cxxflags="$EM_CXX_FLAGS" \ + --extra-ccflags="$EM_CXX_FLAGS" + +# ===== Build gflags on Emscripten ===== +WORKDIR "$REPOS" +RUN mkdir -p "$REPOS/gflags/build" +WORKDIR "$REPOS/gflags/build" +RUN cmake \ + -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE="$EM_TOOLCHAIN" \ + -DCMAKE_CXX_FLAGS="$EM_CXX_FLAGS" \ + .. +RUN cmake --build . + +# ===== Build glog on Emscripten ===== +WORKDIR "$REPOS" +RUN sed -i 's/\bHAVE_SYMBOLIZE 1\b/HAVE_SYMBOLIZE 0/g' glog/CMakeLists.txt +RUN mkdir -p "$REPOS/glog/build" +WORKDIR "$REPOS/glog/build" +RUN cmake \ + -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_TOOLCHAIN_FILE="$EM_TOOLCHAIN" \ + -DCMAKE_CXX_FLAGS="$EM_CXX_FLAGS" \ + -DBUILD_TESTING=OFF \ + -Dgflags_DIR="$REPOS/gflags/build" \ + .. 
+ +# Emscripten has a syscall.h, however it does not implement the exact signature for syscall +# and glog attempts to call it if the header exists, so just pretend it doesn't exist. +RUN sed -i 's/\bHAVE_SYSCALL_H\b/HAVE_SYSCALL_H_INVALID/g' config.h +RUN sed -i 's/\bHAVE_SYS_SYSCALL_H\b/HAVE_SYS_SYSCALL_H_INVALID/g' config.h + +RUN cmake --build . + +# ===== Build remill on Emscripten ===== +WORKDIR "$REPOS" + +# Bypass a remill CMake error about finding clang. +# This compiler is not used under Emscripten since it has its own installed version of clang. +# The version must match the checked out version of llvm-project. +RUN cp /usr/bin/clang /usr/bin/clang-10.0 + +# The emscripten cache needs to be usable by the outside user (currently owned by root). +RUN chmod -R 666 "$EMSDK/.emscripten_cache.lock" \ No newline at end of file diff --git a/web/src/FindXED.cmake b/web/src/FindXED.cmake new file mode 100644 index 000000000..1fe86c4e3 --- /dev/null +++ b/web/src/FindXED.cmake @@ -0,0 +1,12 @@ +set(XED_FOUND TRUE) +set(XED_INCLUDE_DIRS + "/repos/xed/include/public" + "/repos/xed/obj/wkit/include/xed" +) +set(XED_LIBRARIES + "/repos/xed/obj/libxed.a" + "/repos/xed/obj/libxed-ild.a" +) +mark_as_advanced(FORCE XED_FOUND) +mark_as_advanced(FORCE XED_INCLUDE_DIRS) +mark_as_advanced(FORCE XED_LIBRARIES) \ No newline at end of file diff --git a/web/src/index.html b/web/src/index.html new file mode 100644 index 000000000..783a741e4 --- /dev/null +++ b/web/src/index.html @@ -0,0 +1,58 @@ + + + + + + + Remill Lift + + + + + x86 hex (32 bit only for now):
+
+ Output:
+
+ + + + + \ No newline at end of file diff --git a/web/src/index.js b/web/src/index.js new file mode 100755 index 000000000..6662bf0d5 --- /dev/null +++ b/web/src/index.js @@ -0,0 +1,12 @@ +#!/usr/bin/env node +const remill = require("./remill-lift-10.0.js"); +console.log( + "Using Emscripten's virtual file system instead of node fs, " + + "writing a file to /out will be printed, e.g. ir_out=/out"); +remill.onRuntimeInitialized = () => { + remill.callMain(process.argv.slice(2)); + try { + console.log(remill.FS.readFile('/out', { encoding: 'utf8' })) + } catch (err) { + } +} \ No newline at end of file diff --git a/web/src/run.sh b/web/src/run.sh new file mode 100755 index 000000000..5678ecb1b --- /dev/null +++ b/web/src/run.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +set -e +REPO=`git rev-parse --show-toplevel` +docker build -t remill/web $REPO/web/src +docker run \ + --rm \ + -u $(id -u):$(id -g) \ + -e CCACHE_DIR="$REPO/web/build/cache" \ + -v "$REPO:$REPO" \ + -w "`pwd`" \ + remill/web \ + "$@" \ No newline at end of file From 831caabdc8ddf46aed3ed578a13a83ff42292cac Mon Sep 17 00:00:00 2001 From: TrevorSundberg Date: Tue, 18 Feb 2020 08:44:35 -0800 Subject: [PATCH 2/3] Don't overwrite CMAKE_MODULE_PATH --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ae7c30b93..b0de6b3e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,7 +29,7 @@ if(EMSCRIPTEN) # We want to manually invoke main for Lift instead of Emscripten calling it for us # So we disable INVOKE_RUN, export callMain (and FS for files). 
- set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/web/src") + list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/web/src") set(CMAKE_CXX_FLAGS "\ ${CMAKE_CXX_FLAGS} \ $ENV{EM_CXX_FLAGS} \ From b5d5e4283eb1df21d068b191a804f397c9f59f51 Mon Sep 17 00:00:00 2001 From: TrevorSundberg Date: Tue, 18 Feb 2020 09:01:35 -0800 Subject: [PATCH 3/3] Use __EMSCRIPTEN__ --- remill/OS/OS.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/remill/OS/OS.h b/remill/OS/OS.h index 8f3da6444..7bed55859 100644 --- a/remill/OS/OS.h +++ b/remill/OS/OS.h @@ -24,7 +24,7 @@ # define REMILL_ON_LINUX 0 # define REMILL_ON_WINDOWS 0 # define REMILL_OS "macos" -# elif defined(__linux__) || defined(EMSCRIPTEN) +# elif defined(__linux__) || defined(__EMSCRIPTEN__) # define REMILL_ON_MACOS 0 # define REMILL_ON_LINUX 1 # define REMILL_ON_WINDOWS 0