Interacting with the environment

Up until this point, our compiler has had a curious property. Here's another compiler we could have written for our language:

compile.ml

let rec compile_value (stack_index : int) (v : Interp.value) =
  match v with
  | Number n ->
      [Mov (Reg Rax, operand_of_num n)]
  | Boolean b ->
      [Mov (Reg Rax, operand_of_bool b)]
  | Pair (v1, v2) ->
      compile_value stack_index v1
      @ [Mov (stack_address stack_index, Reg Rax)]
      @ compile_value (stack_index - 8) v2
      @ [ Mov (Reg R8, stack_address stack_index)
        ; Mov (MemOffset (Reg Rdi, Imm 0), Reg R8)
        ; Mov (MemOffset (Reg Rdi, Imm 8), Reg Rax)
        ; Mov (Reg Rax, Reg Rdi)
        ; Or (Reg Rax, Imm pair_tag)
        ; Add (Reg Rdi, Imm 16) ]

let compile (program : s_exp) : string =
  [Global "entry"; Label "entry"]
  @ compile_value (Interp.interp_exp Symtab.empty program)
  @ [Ret]
  |> List.map string_of_directive
  |> String.concat "\n"

That's quite a bit simpler! It's even a better compiler–the programs it outputs are short and execute very efficiently. Given that it has been much easier to write our interpreter than to write our compiler, why not just do this? Why does anyone work on compilers?

The answer, of course, is that most programs don't just compute the answer to a fixed expression! We usually write programs because we want to do the same operation multiple times on different inputs. However, in our current language, there's no way to get an input from the outside world. Let's fix that now!

Input

Adding input to the AST

We want to add the language construct (read-num) to our input language. Notice a subtlety here: unlike true and false, which are values, read-num is a function -- we enclose it in parentheses to "call" it. It corresponds to the S-expression Lst [Sym "read-num"].

We'll introduce a type of 0-ary primitives in our AST.

ast.ml

type prim0 = ReadNum

let prim0_of_string (s : string) : prim0 option =
  match s with "read-num" -> Some ReadNum | _ -> None

type expr =
...
  | Prim0 of prim0
...

let rec expr_of_s_exp (e : s_exp) : expr =
  match e with
...
  | Lst [Sym op] when Option.is_some (prim0_of_string op) ->
      Prim0 (Option.get (prim0_of_string op))
...

Adding input to the interpreter

interp.ml

let rec interp_exp env (exp : expr) : value =
  match exp with
  | Prim0 ReadNum ->
      Number (input_line stdin |> int_of_string)
  (* ... *)

Simple enough! We get slightly weird behavior if we do this, though (at least on my machine):

> interp "(pair (read-num) (read-num))";;
1
2
(pair 2 1)

What's going on here? As it turns out, the problem isn't with read-num–it's with pair!

interp.ml

let rec interp_exp env (exp : expr) : value =
  match exp with
  (* ... *)
  | Prim2 (Pair, e1, e2) ->
      Pair (interp_exp env e1, interp_exp env e2)
  (* ... *)

We're calling interp_exp twice, and each of those ends up reading input. But it seems like the second one is happening first!

The order in which OCaml evaluates arguments to functions (in this case, the function that constructs a pair) is actually unspecified: the implementation can evaluate them in whatever order it likes. This often doesn't really matter–for pure expressions like most of the ones we've been dealing with, it doesn't matter when they are evaluated! Now, though, we've introduced a side-effect: reading input. It really does matter now when those calls to interp_exp get evaluated!

Our compiler evaluates arguments to binary operations like pair in left to right order. We'll do the same in the interpreter:

interp.ml

let rec interp_exp env (exp : expr) : value =
  match exp with
  (* ... *)
  | Prim2 (Pair, e1, e2) ->
      let l = interp_exp env e1 in
      let r = interp_exp env e2 in
      Pair (l, r)
  (* ... *)


let interp (program : string) : string =
  interp_exp Symtab.empty (parse program)

Adding input to the compiler

First, we'll add a function to the C runtime to read a number:

runtime.c

uint64_t read_num() {
  int r;
  scanf("%d", &r);
  return (uint64_t)(r) << num_shift;
}

We'll need to let the assembler know that the read_num label is defined in the runtime:

compile.ml

let compile (program : expr) : string =
  [ Global "entry"
  ; Extern "error"
  ; Extern "read_num"
  ; Label "entry" ]
  @ compile_exp Symtab.empty (-8) program
  @ [Ret]
  |> List.map string_of_directive
  |> String.concat "\n"

Now we need to actually compile the (read-num) form into a call to this function. In our error-handling code, we were able to "call" C functions just by jumping to the right label. Fundamentally that's still what's going to happen, but we're going to need to do some additional work to make sure our program can keep executing after the function call:

read_num might use the rdi register where our heap pointer lives. We'll need to save the heap pointer to the stack before read_num starts executing.
Speaking of the stack, we've been writing to stack locations like [rsp - 8], [rsp - 16], etc. read_num might also want to write to [rsp - 8], which will be a problem since we might have data there! The solution is to adjust rsp and restore it after the call.
We haven't modified memory at [rsp + 0] because that's where our function's return address is–when we call ret, the processor resumes operations at that address. Instead of calling read_num using jmp, we'll use call, which will put the new return address on the top of the stack.
Finally: C functions on x86-64 require that when they start executing, rsp (the location of the return address) is a multiple of 16. So, we'll need to further adjust rsp to make sure this property holds.

Here's the code we end up with:

compile.ml

let align_stack_index (stack_index : int) : int =
  if stack_index mod 16 = -8 then stack_index else stack_index - 8

let rec compile_exp (tab : int symtab) (stack_index : int) (exp : s_exp) :
    directive list =
  match exp with
  | Prim0 ReadNum ->
      [ Mov (stack_address stack_index, Reg Rdi)
      ; Add (Reg Rsp, Imm (align_stack_index stack_index))
      ; Call "read_num"
      ; Sub (Reg Rsp, Imm (align_stack_index stack_index))
      ; Mov (Reg Rdi, stack_address stack_index) ]
  (* ... *)

See the lecture capture for an explanation of why this works, including some worked examples.

Output

We finished last class being able to take input from the user. It makes sense to also be able to write output. Specifically, we'll implement (print e) and (newline).

So far, our interactions with our compiler have been kind of REPL-like, maybe without the L: we read an S-expression, evaluate it, and print the result. Since every program in our language is a value, this makes sense. But when we add input and output side effects, it makes less sense. We might want to write a program that prints multiple things, or does a computation and prints nothing at all.

Adding output to the AST

On HW4, you may have seen that we provided you with a do function, which worked a lot like OCaml's ;. This wasn't needed before because we weren't writing programs with side effects. Now, it will be useful for us to do this kind of sequencing, so we'll implement do in the class interpreter and compiler as well.

ast.ml

type prim0 = ReadNum | Newline

let prim0_of_string (s : string) : prim0 option =
  match s with
  (* ... *)
  | "newline" ->
      Some Newline

type prim1 = Add1 | Sub1 | ZeroP | NumP | Not | Left | Right | Print

let prim1_of_string (s : string) : prim1 option =
  match s with
  (* ... *)
  | "print" ->
      Some Print


type expr =
   (* ... *)
  | Do of expr list

Adding output to the interpreter

Unsurprisingly, adding output to the interpreter looks a lot like input. We use OCaml functionality to make it easy.

It's a somewhat arbitrary choice to have (print e) return true. But it's an easy one to implement, and a sensible option: really, print is a "void" function, and we don't care about its output.

interp.ml

let output_channel = ref stdout

let rec interp_exp env (exp : expr) : value =
  match exp with
  (* ... *)
  | Do exps ->
      exps |> List.rev_map (interp_exp env) |> List.hd
  | Prim1 (Print, e) ->
      interp_exp env e |> string_of_value |> output_string stdout ;
      Boolean true
  | Prim0 Newline ->
      output_string stdout "\n" ;

We should also change our main calling function so that it doesn't print the return value automatically.

let interp (program : string) : unit =
  parse program |> interp_exp Symtab.empty |> ignore

Adding output to the compiler

In the runtime

We have very little to do in the runtime. In fact, we've already implemented a (C) version of print_value. So we can just call this as an external function! We only need to remove it, again, from our main calling function.

We'll add a function print_newline() that doesn't do very much.

runtime.c

int main(int argc, char **argv) {
  void *heap = (void *)malloc(4096);
  entry(heap);
  return 0;
}

void print_newline() {
  printf("\n");
}

We'll then recompile the runtime:

gcc -c runtime.c -o runtime.o

In the compiler

Our task list is familiar now. First, we'll make the assembler aware of the C functions we want to use.

compile.ml

let compile (program : expr) : string =
  [ Global "entry"
  ; Extern "error"
  ; Extern "read_num"
  ; Extern "print_value"
  ; Extern "print_newline"
  ; Label "entry" ]
  @ compile_exp Symtab.empty (-8) program
  @ [Ret]
  |> List.map string_of_directive
  |> String.concat "\n"

The compiler changes will look a lot like the read-num case. Remember that the C calling convention expects to find the input to print_value in rdi. We have to make sure this value is there, and that we return the value true at the end in rax.

newline is even simpler, since we don't need to worry about the argument.

let rec compile_exp (tab : int symtab) (stack_index : int) (exp : expr) :
    directive list =
  match exp with
  | Do exps ->
      List.map (fun exp -> compile_exp tab stack_index exp) exps
      |> List.concat
  | Prim1 (Print, e) -> 
    compile_exp tab stack_index e
    @ [ Mov (stack_address stack_index, Reg Rdi)
      ; Mov (Reg Rdi, Reg Rax)
      ; Add (Reg Rsp, Imm (align_stack_index stack_index))
      ; Call "print_value"
      ; Sub (Reg Rsp, Imm (align_stack_index stack_index))
      ; Mov (Reg Rdi, stack_address stack_index)
      ; Mov (Reg Rax, operand_of_bool true) ]
  | Prim0 Newline ->
      [ Mov (stack_address stack_index, Reg Rdi)
      ; Add (Reg Rsp, Imm (align_stack_index stack_index))
      ; Call "print_newline"
      ; Sub (Reg Rsp, Imm (align_stack_index stack_index))
      ; Mov (Reg Rdi, stack_address stack_index)
      ; Mov (Reg Rax, operand_of_bool true) ]
  (* ... *)

let compile_and_run (program : string) : unit =
  compile_to_file program ;
  ignore (Unix.system "nasm program.s -f elf64 -o program.o") ;
  ignore
    (Unix.system "gcc program.o runtime.o -o program -z noexecstack") ;
  ignore (Unix.system "./program")

Updating difftest infrastructure

Our testing infrastructure is now all messed up. First of all, our tests will need to print something, otherwise we won't be able to observe the results of the computations. Second of all, we probably want to test (read-num), which expects user input!

We're not going to talk through the details of this testing infrastructure. The bottom line is, we should be able to give our tester input strings along with the programs we want to run, and compare the entire output.

interp.ml

let interp (program : string) : unit =
  interp_exp Symtab.empty (parse program) |> ignore

let interp_io (program : string) (input : string) =
  let input_pipe_ex, input_pipe_en = Unix.pipe () in
  let output_pipe_ex, output_pipe_en = Unix.pipe () in
  input_channel := Unix.in_channel_of_descr input_pipe_ex ;
  set_binary_mode_in !input_channel false ;
  output_channel := Unix.out_channel_of_descr output_pipe_en ;
  set_binary_mode_out !output_channel false ;
  let write_input_channel = Unix.out_channel_of_descr input_pipe_en in
  set_binary_mode_out write_input_channel false ;
  let read_output_channel = Unix.in_channel_of_descr output_pipe_ex in
  set_binary_mode_in read_output_channel false ;
  output_string write_input_channel input ;
  close_out write_input_channel ;
  interp program ;
  close_out !output_channel ;
  let r = input_all read_output_channel in
  input_channel := stdin ;
  output_channel := stdout ;
  r

let interp_err (program : string) (input : string) : string =
  try interp_io program input with BadExpression _ -> "ERROR"

compile.ml

let compile_and_run_io (program : string) (input : string) : string =
  compile_to_file program ;
  ignore (Unix.system "nasm program.s -f macho64 -o program.o") ;
  ignore (Unix.system "gcc program.o runtime.o -o program") ;
  let inp, outp = Unix.open_process "./program" in
  output_string outp input ;
  close_out outp ;
  let r = input_all inp in
  close_in inp ; r

let compile_and_run_err (program : string) (input : string) : string =
  try compile_and_run_io program input with BadExpression _ -> "ERROR"

let difftest (examples : (string * string) list) =
  let results =
    List.map
      (fun (ex, i) -> (compile_and_run_err ex i, Interp.interp_err ex i))
      examples
  in
  List.for_all (fun (r1, r2) -> r1 = r2) results

let test () = difftest [("(print (read-num))", "1")]