diff --git a/.github/ci.sh b/.github/ci.sh index e9a549147c..9cadb596e6 100755 --- a/.github/ci.sh +++ b/.github/ci.sh @@ -84,15 +84,17 @@ build_cryptol() { bundle_files() { mkdir -p dist dist/{bin,deps,doc,examples,include,lib} + mkdir -p dist/doc/{llvm-java-verification-with-saw,rust-verification-with-saw,saw-user-manual} cp LICENSE README.md dist/ $IS_WIN || chmod +x dist/bin/* (cd deps/cryptol-specs && git archive --prefix=cryptol-specs/ --format=tar HEAD) | (cd dist/deps && tar x) - cp doc/extcore.md dist/doc - cp doc/tutorial/sawScriptTutorial.pdf dist/doc/tutorial.pdf - cp doc/manual/manual.pdf dist/doc/manual.pdf - cp -r doc/tutorial/code dist/doc + cp doc/pdfs/llvm-java-verification-with-saw.pdf dist/doc/llvm-java-verification-with-saw + cp doc/pdfs/rust-verification-with-saw.pdf dist/doc/rust-verification-with-saw + cp doc/pdfs/saw-user-manual.pdf dist/doc/saw-user-manual + cp -r doc/llvm-java-verification-with-saw/code dist/doc/llvm-java-verification-with-saw + cp -r doc/rust-verification-with-saw/code dist/doc/rust-verification-with-saw cp intTests/jars/galois.jar dist/lib cp -r deps/cryptol/lib/* dist/lib cp -r examples/* dist/examples diff --git a/CHANGES.md b/CHANGES.md index c4e74a4bce..2061462556 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,12 @@ ## New Features +* SAW documentation is now under a single Sphinx umbrella, resulting in a + complete overhaul of the `doc/` directory (#1723). Generally speaking, all + _content_ has stayed the same, and only organization has changed. Most + importantly for users, the PDF artifacts included in CI-generated releases + have different names and render with Sphinx styling. + * Add a `bitwuzla` family of proof scripts that use the Bitwuzla SMT solver. * Add a `:tenv` REPL command, which is like `:env` but prints the type diff --git a/doc/.env b/doc/.env new file mode 100644 index 0000000000..9d8625fda5 --- /dev/null +++ b/doc/.env @@ -0,0 +1 @@ +SOURCE_DATE_EPOCH=1743465600 diff --git a/doc/.gitignore b/doc/.gitignore new file mode 100644 index 0000000000..6c95d53874 --- /dev/null +++ b/doc/.gitignore @@ -0,0 +1,5 @@ +__pycache__ +*.egg-info + +_build +*.tar.gz diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000000..8df7621ccd --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,45 @@ +# SAW Documentation Makefile +# This is derived from Sphinx's default Makefile +include .env + +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +SUPPORTED_DOCS = html latex latexpdf linkcheck + +help: + $(info SAW Documentation (based on Sphinx)) + $(info Setup Python environment: 'make setup-env') + $(info Supported document formats: 'make' + one of: $(SUPPORTED_DOCS)) + $(info Install PDFs to repository: 'make install-pdf') + $(info Tidy code packages / docs: 'make mostlyclean') + $(info Tidy everything, including Python venv: 'make clean') + +sphinx-help: setup-env + @. .venv/bin/activate && $(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: setup-env pdf install-pdf mostlyclean clean help $(SUPPORTED_DOCS) + +setup-env: + scripts/setup_env.sh + +pdf: latexpdf + +install-pdf: latexpdf + mkdir -p pdfs + cp -f $(shell ls _build/latex/*.pdf | sed '/\/galois\.pdf$$/d') pdfs + +mostlyclean: + rm -i $(shell find . -path ./$(BUILDDIR) -prune -o -name '*.tar.gz' -print) + rm -rf _build + +clean: mostlyclean + rm -rf .venv + +$(SUPPORTED_DOCS): setup-env + @SOURCE_DATE_EPOCH=${SOURCE_DATE_EPOCH} . .venv/bin/activate && $(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +%: + $(error "Target '$@' unknown; try 'make help'.") diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 0000000000..03b84c4865 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,292 @@ +# SAW Documentation + +Welcome to the home of all things SAW documentation! + +This `README` covers: + +- Building the documentation +- The structure of `doc/` (this directory), including how to add new materials +- Information for developers working on the Sphinx build itself, and not just + documentation content + +## Building the documentation + +For the sake of rapid editing and simple deployment, we have attempted to make +it as easy as possible to build SAW's documentation, especially for those only +interested in modifying the documentation _content_. + +### Prerequisites + +The only _strict_ requirement to build the documentation locally is a relatively +recent Python (>=3.9, to be safe - we aren't prioritizing making this work on +every Python under the sun), and `make`. + +This is enough to generate the HTML versions of documentation, which is what is +deployed to . + +If you want to generate the PDFs that can be found in `doc/pdfs`, you'll also +need `xelatex`. +This will require a suitable TeX distribution. + +### `Makefile` + +As is typical for Sphinx projects, SAW's documentation is built using `make`. + +Our `Makefile` is a tailored version of that which is generated when starting a +new Sphinx project, in particular restricting the document formats that can be +targeted, and handling the setup of an appropriate Python environment in which +to run the Sphinx build. + +Typing `make help` is the easiest way to get started. +The following targets are available: + +- `help`: Display all of the most useful targets +- `sphinx-help`: Display Sphinx's built-in help (**NOTE:** Most of these targets + are purposefully made unavailable and won't do anything!) +- `setup-env`: Create a Python venv (in `doc/.venv/`) suitable for building the + documentation. + This tries not to do unnecessary work: If you already have a `doc/.venv/`, it + will simply install the dependencies the build requires. +- `package-code`: Create tar.gz archives of all `doc/**/code/` directories. + This can be used to provide downloads in the HTML rendering of the + documentation (see [below](#code-examples)). +- `install-pdf`: Build and install PDF renderings to `doc/pdfs/`. +- `mostlyclean`: Clear out all packaged code and Sphinx-generated files. +- `clean`: All of the above, plus the Python environment. + +`make help` always shows the most up-to-date list of supported document targets. + +#### `scripts/` + +This directory contains utility scripts (and the documentation's +`requirements.txt`) used to implement the `make` targets above. + +- `scripts/setup_env.sh` creates and validates a suitable Python environment, + independent of the documentation build. + It uses the sibling `requirements.txt` file, and can be run anywhere you want + to create a `.venv/` suitable for building SAW documentation (though in most + cases, you'll just be running `make setup-env` in `doc/`). +- `scripts/package_code.sh` finds all directories named `code` in the file tree, + and if there is no sibling `code.tar.gz`, creates it. + This is the implementation of `make package-code`, but like + `scripts/setup_env.sh` can be run anywhere. + +### Troubleshooting + +1. If the build succeeds, but you aren't seeing all of the expected changes: + + Try running `make mostlyclean` before rebuilding; Sphinx may have failed to + detect the changes you made, and you're seeing stale/cached output. +2. If you have unresolved reference warnings: + + Sphinx has an incredibly robust cross-reference system, but it is easy to get + "wrong". + If using Markdown/MyST, make sure you are using references according to [this + documentation](https://myst-parser.readthedocs.io/en/latest/syntax/cross-referencing.html). +3. If the build fails: + + Try a full `make clean` to wipe out the `.venv/` directory and start again in + a fresh environment. + +If you still have trouble, you've likely uncovered a bug in the system. +Please [open an issue](https://github.com/GaloisInc/saw-script/issues) detailing +the expected behavior, the observed behavior, and instructions to reproduce the +observed behavior. + +## Understanding `doc/` + +This directory is the intended source root for all SAW documentation. + +### Contents + +- The `Makefile` is explained in more detail [above](#makefile). + +- `conf.py` is our Sphinx configuration, which has been heavily customized from + the default Sphinx provides. + +- `index.md` defines the top-level document hierarchy, and is the intended home + page of . + +- The `bibliography/` directory contains BibTeX (and possibly other) + bibliographies. + +- The `developer/` directory is an assortment of documentation mostly relevant + to developers of `saw-script`, including: + + - Documentation of the SAWCore `extcore` format + - Limitations of SAW and its use + - The `saw-script` release process + + +- The `figures/` directory is used to store image files / similar resources used + throughout SAW documentation. + +- The `pdfs/` directory is where committed versions of PDF renderings live. + +- The `scripts/` directory is where scripts relevant to building the + documentation are located. + This includes a `requirements.txt` pinning dependencies for the Sphinx build. + +- `saw-lexer` is a small Python package implementing a SAWScript `pygments` + lexer, so our documentation understands and properly renders fenced code + blocks labeled ```````sawscript````. + +- `sphinx-download-dir` is a Python package implementing an extension for Sphinx + in the form of a _role_ (essentially, an inline element with some semantics) + for creating full-directory downloads more easily. + See [below](#code-examples) for usage detail. + + +The remaining directories correspond to individual SAW documentation resources +currently managed as part of this documentation ecosystem: + +- `llvm-java-verification-with-saw/` + "LLVM/Java Verification with SAW" + This is a tutorial-style introduction to SAW +- `rust-verification-with-saw/` + "Rust Verification with SAW" + Like the above, but using the experimental MIR features +- `saw-user-manual/` + "SAW User Manual" + The only 'complete' SAW reference right now + +### Non-contents + +These are pieces of documentation in `saw-script` that are _not_ part of the +`doc/` ecosystem: + +- `saw-core/doc/`: A lone TeX file, presumably describing formalism related to + the SAWCore intermediate representation +- `saw-remote-api/docs/`: An old Sphinx setup for SAW's Python remote API + +### Adding new materials + +Sphinx provides a great deal of flexibility, but we prefer to keep that +flexibility in check by maintaining a close correspondence between the directory +structure of `doc/` and the document hierarchy its contents define. + +That said, to add new documentation material: + +1. Create a new directory in `doc/` for your material. + Name it the same as your intended title, or an appropriate transformation (to + e.g. remove spaces/spcial characters). + + Add an `index.md` file that will define the top-level of your material's + hierarchy with a `toctree`. + Feel free to make this `toctree` `:hidden:`, give it a `:caption:`, or make + any other decisions appropriate for your material. + As long as the `toctree` is defined, even if it is hidden Sphinx will do the + right thing to generate HTML/PDFs. + + Do note that, for materials intended to add to `doc/pdfs/`, the contents of + their `index.md` (other than the hierarchical information) will **not** + render in the PDF - this is an intentional choice to allow for some + "HTML-only" content in the front-matter (e.g. code downloads for tutorials). + See [below](#pdf-generation) for information about setting the title of + generated PDFs given this rendering quirk. +2. Create additional `.md` files for your content, organizing them using + `toctree`s as appropriate. + Ideally, for every entry in the top-level `toctree` of your material, there + will be a corresponding `.md` file (or directory of `.md` files, possibly + with its own `index.md` and additional `toctree`s). + This is how we maintain the directory structure / document hierarchy + correspondence. + +Note that your documentation doesn't necessarily _need_ its own directory; if +it's a single page, consider adding it as a single `.md` file to the most +appropriate directory, and adding an entry to the corresponding `toctree` (for +some examples of this, see `doc/developer/`). + +#### Code examples + +It is useful to provide code sample downloads when writing tutorial-style +documentation materials, so readers can easily follow along. + +We have implemented a small Sphinx extension, `sphinx-download-dir`, to make it +easier to provide full-directory downloads in your documents. + +To use this feature: + +1. Create your directory that should be downloadable somewhere in `doc/` (as + long as it's somewhere in the tree, you'll be able to create a download + link). +2. Take note of the directory's location, either relative to `doc/` or relative + to the document where you would like to create a download link (either will + work). +3. Add the link: + + ```markdown + Check out my cool {download-dir}`download`! + ``` + + This will look for `path/to/dir` relative to this document; to look for it + relative to `doc/`, prefix with a `/` + (i.e. ``{download-dir}``). + +That's it! + +#### PDF generation + +Some resources should be turned into PDFs stored in `doc/pdfs`. + +Fortunately, Sphinx allows for the layout of generated TeX / PDFs to be +controlled at a fairly granular level. +The file `doc/conf.py` attempts to simplify this for our purposes. + +In that configuration, there is a variable `pdfs` that holds a reference to a +list of 3-tuples of strings, each of which has the following form: + +`(, , "howto"|"manual")` + +The first entry is the root of the resource, assumed to contain an `index.md` +document defining the `toctree` for the resource. + +The second entry is the title as it will render on the PDF cover page. + +The third entry essentially determines the top-level layout of the material: +`"howto"` results in sections at the top-level, while `"manual"` results in +chapters. +It's a good idea to play with both to find the right fit for your documentation. + +By adding an entry to the `pdfs` list, Sphinx will build a TeX document and +corresonding PDF rooted at the configured directory. +This is indeed how we generate the separate tutorials and user manual, even +though `doc/` is _technically_ one giant document. + +If you need to use this feature, consult `doc/conf.py` for additional +documentation (and to see the tutorials/manual example configurations). + +## For developers + +These notes are primarily for those developers working on the documentation +ecosystem itself. + +### Sphinx doesn't _require_ Markdown + +Because all of the existing SAW documentation was authored in (variants of) +Markdown, it was convenient to use extensions (i.e. `myst-parser`) to quickly +move the existing work into a Sphinx environment without having to translate to +restructuredText. + +This is because Sphinx is a documentation _engine_, capable of handling all +kinds of inputs (and producing all kinds of outputs). +In this way, it is similar to the `pandoc` approach we took previously. + +That said, due to this flexibility, it is possible to mix-and-match reST and +Markdown when authoring documentation - Sphinx will take care of properly parsing +and resolving everything. + +We **strongly** recommend that you stick with Markdown (and the extensions +provided by MyST) unless strictly necessary when authoring SAW documentation, +for consistency with existing materials and to avoid issues resolving, for +example, heading levels. + +### The Python environment + +If you are hacking on the documentation build itself (e.g. configuring Sphinx, +playing with new extensions, or even simply upgrading the pinned dependencies in +), you can manually activate the Python virtual +environment with `.venv/bin/activate`, and deactivate it again with +`deactivate`. + +This ought not be necessary for most work on documentation. diff --git a/doc/tutorial/sawScriptTutorial.bib b/doc/bibliography/saw.bib similarity index 99% rename from doc/tutorial/sawScriptTutorial.bib rename to doc/bibliography/saw.bib index 23ccb20175..ca139382d1 100644 --- a/doc/tutorial/sawScriptTutorial.bib +++ b/doc/bibliography/saw.bib @@ -6,7 +6,7 @@ @book{Lindholm:1999:JVM:553607 edition = {2nd}, publisher = {Addison-Wesley Longman Publishing Co., Inc.}, address = {Boston, MA, USA}, -} +} @Inproceedings{erkok-matthews-cryptolEqChecking-09, author = "Levent {Erk\"{o}k} and John Matthews", diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 0000000000..88ede864fd --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,96 @@ +# -- Project information ----------------------------------------------------- + +project = "SAW Documentation" +copyright = "2025, Galois, Inc" +author = "The SAW Development Team" + +# -- General configuration --------------------------------------------------- + +extensions = [ + "sphinx_copybutton", + "sphinx_download_dir", + "hoverxref.extension", + "notfound.extension", + "myst_parser", +] + +myst_enable_extensions = [ + "amsmath", + "colon_fence", + "deflist", + "dollarmath", + "replacements", + "smartquotes", + "strikethrough", +] + +# -- Options for markup ------------------------------------------------------ + +primary_domain = None + +# -- Options for the nitpicky mode ------------------------------------------- + +nitpicky = True + +# -- Options for source files ------------------------------------------------ + +exclude_patterns = [ + "_build", + "Thumbs.db", + ".DS_Store", + ".venv", + "developer", + "**/code", + "README.md", +] + +# -- Options for HTML output ------------------------------------------------- + +html_theme = "sphinx_rtd_theme" + +# -- Options for LaTeX output ------------------------------------------------ + +pdfs: list[tuple[str, str, str]] = [ + ("llvm-java-verification-with-saw", "LLVM/Java Verification with SAW", "howto"), + ("rust-verification-with-saw", "Rust Verification with SAW", "howto"), + ("saw-user-manual", "SAW User Manual", "manual"), +] +"""Documents we wish to render to separate PDFs. + +These documents are given as 3-tuples of strings: + +(, , 'howto'|'manual') + +Usually, if you add something to the toctree in index.md, it will get an entry +here (or vice-versa). +The most notable exception is development/, where developer documentation (e.g. +the SAWCore external format reference) is stored. + +The document root directory is assumed to contain a document named index, +defining the toctree for that document. +The contents of this index (other than the table of contents it defines) will +_not_ be included in the rendered PDF, so can be used for Web-specific +materials. + +The title is what will render on the PDF's cover page. +This will usually match the title/first heading of the index document. + +'howto' will use \\section as the highest level of organization, while 'manual' +will use \\chapter. +Generally speaking, the former is more appropriate for shorter/tutorial-style +documents, while the latter is best for longer/reference-style documents. +When adding new materials to the SAW documentation, it is worth testing both +styles to see what flows more naturally. +""" + +latex_engine = "xelatex" +latex_documents = [ + (f"{doc_dir}/index", f"{doc_dir}.tex", title, author, style, True) + for doc_dir, title, style in pdfs +] +latex_logo = "figures/galois.pdf" +latex_show_urls = "inline" +latex_elements = { + "pointsize": "12pt", + "fncychap": r"\usepackage[Sonny]{fncychap}", +} diff --git a/doc/extcore.md b/doc/developer/extcore.md similarity index 62% rename from doc/extcore.md rename to doc/developer/extcore.md index c1450b52c2..3db7790763 100644 --- a/doc/extcore.md +++ b/doc/developer/extcore.md @@ -1,9 +1,10 @@ -Introduction -============ +# SAWCore External Format + +## Introduction Consider the following example SAWScript program: -``` +```sawscript main = do { let excluded_middle x = x || not x; print_term excluded_middle; @@ -14,7 +15,7 @@ main = do { The `print_term` command pretty prints the representation of `excluded_middle` as a term in the core language. -``` +```text \(x::Prelude.Bool) -> Prelude.or x (Prelude.not x) ``` @@ -22,7 +23,7 @@ The `write_core` command outputs a file in a more easily computer-readable `extcore` format. (The format can then be read in by the `read_core` command.) -``` +```text SAWCoreTerm 8 1 Data Prelude.Bool 2 Global Prelude.or @@ -54,33 +55,32 @@ In the following table, angle brackets enclose descriptions of each argument. Parentheses and asterisks are used to describe patterns of valid arguments, and will not show up in files. -Form Description ------------------------------------------------------ ----------- -`ExtCns ` External input -`Lam ` Function abstraction -`Pi ` Function type -`Var ` Bound variable (de Bruijn indexed) -`Global ` Toplevel constant -`App ` Function application -`Tuple *` Tuple value -`TupleT *` Tuple type -`TupleSel ` Tuple component selector (`x.1`) -`Record ( )*` Record value -`RecordT ( )*` Record type -`RecordSel ` Record component selector (`x.foo`) -`Ctor *` Data constructor value -`Data *` Datatype -`Sort ` Sort -`Nat ` Non-negative integer literal -`Array *` Array value (e.g. `[1, 2, 3]`) -`Float ` Literal of type `Float` -`Double ` Literal of type `Double` -`String ` Literal of type `String` +| Form | Description | +| ------------------------------------------------------- | ------------------------------------| +| `ExtCns ` | External input | +| `Lam ` | Function abstraction | +| `Pi ` | Function type | +| `Var ` | Bound variable (de Bruijn indexed) | +| `Global ` | Toplevel constant | +| `App ` | Function application | +| `Tuple *` | Tuple value | +| `TupleT *` | Tuple type | +| `TupleSel ` | Tuple component selector (`x.1`) | +| `Record ( )*` | Record value | +| `RecordT ( )*` | Record type | +| `RecordSel ` | Record component selector (`x.foo`) | +| `Ctor *` | Data constructor value | +| `Data *` | Datatype | +| `Sort ` | Sort | +| `Nat ` | Non-negative integer literal | +| `Array *` | Array value (e.g. `[1, 2, 3]`) | +| `Float ` | Literal of type `Float` | +| `Double ` | Literal of type `Double` | +| `String ` | Literal of type `String` | The following sections describe each of these keywords in more detail. -Inputs and Scalar Constants -=========================== +## Inputs and Scalar Constants The simplest terms in `extcore` refer to external inputs and constant values. Two types of external inputs exist. @@ -101,8 +101,7 @@ the "Bit Vectors" section that converts a natural number to a bit vector. Later sections describe how to write aggregated or structured constants. -Applications -============ +## Applications Computations in SAWCore are accomplished by applying operators (or any term of function type) to operands. Application is structured in @@ -111,7 +110,7 @@ to one argument. Functions that take multiple arguments require multiple application nodes. For example, to add two 8-bit bit vectors, we can use the following code: -``` +```text 1 Global Prelude.bitvector 2 Nat 8 3 App 1 2 @@ -130,8 +129,7 @@ then declared on lines 4 and 5. Line 7 then applies the builtin bit vectors). Finally, lines 8 and 9 continue the application to include the two input variables. -Booleans and Bit Vectors -======================== +## Booleans and Bit Vectors The previous section gave an example of a bit vector operation. The SAWCore prelude contains a number of built-in operations on both bit @@ -158,8 +156,7 @@ sum of the lengths of its arguments. The complete collection of bit vector operations appears in the Reference section at the end of this document. -Arrays, Tuples and Records -========================== +## Arrays, Tuples and Records SAWCore allows aggregation of arbitrary data types into composite types: arrays, tuples, and records. Arrays are collections, of known @@ -188,8 +185,7 @@ each type or value index is preceded by a field name. Record field selection is identical to tuple element selection except that it uses a field name instead of an element index. -Function Abstractions -===================== +## Function Abstractions SAWCore allows the creation of function abstractions. The construct `Lam ` causes a function argument value of the given type @@ -208,7 +204,7 @@ As an example, the code presented earlier in the Application section, to add two 8-bit bit vector arguments, could be restructured to use `Lam` and `Var` as follows: -``` +```text 1 Global Prelude.bitvector 2 Nat 8 3 App 1 2 @@ -222,8 +218,7 @@ to add two 8-bit bit vector arguments, could be restructured to use 11 Lam x 3 10 ``` -Custom Data Types -================= +## Custom Data Types Several built-in data types, such as records and tuples, have dedicated syntax within the language. Other data types, however, @@ -253,18 +248,17 @@ first argument, followed by a sequence of element expression indices. Alternatively, vector values can be constructed piece-by-piece using the two data constructors: - * `EmptyVec` which takes a type as an argument and produces a vector - with zero elements of that type, and +- `EmptyVec` which takes a type as an argument and produces a vector + with zero elements of that type, and - * `ConsVec` which takes a type, a value, a size, and an existing - vector of that given size, and produces a new vector of size one - larger, with the given element value at the beginning. +- `ConsVec` which takes a type, a value, a size, and an existing + vector of that given size, and produces a new vector of size one + larger, with the given element value at the beginning. Other type and data constructors exist in the SAWCore prelude, but they rarely occur in terms exported for analysis by third-party tools. -Reference -========= +## Reference This section summarizes the built-in types, boolean functions, and bit vector functions defined in the SAWCore prelude. These types and @@ -272,45 +266,45 @@ functions will apppear in `extcore` files in the form `Prelude.`, but are listed below in the form ``, without the `Prelude` prefix, for brevity and readability. -Prelude types: - -Name Kind Comments ----- ---- -------- -`Bool` `Type` -`Nat` `Type` -`bitvector` `Nat -> Type` Abbreviation for `Vec n Bool` -`Vec` `Nat -> Type -> Type` -`String` `Type` - -Prelude boolean functions: - -Name Type ----- ---- -`and` `Bool -> Bool -> Bool` -`or` `Bool -> Bool -> Bool` -`xor` `Bool -> Bool -> Bool` -`boolEq` `Bool -> Bool -> Bool` -`not` `Bool -> Bool` -`ite` `(a:Type) -> Bool -> a -> a -> a` - -Prelude bit vector functions: - -Name Type ----- ---- -`msb` `(n:Nat) -> bitvector (n + 1) -> Bool` -`bvNat` `(n:Nat) -> Nat -> bitvector n` -`bvToNat` `(n:Nat) -> bitvector n -> Nat` -`bvAdd` `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` -`bvSub` `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` -`bvMul` `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` -`bvUDiv` `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` -`bvURem` `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` -`bvSDiv` `(n:Nat) -> bitvector (n + 1) -> bitvector (n + 1) -> bitvector (n + 1)` -`bvSRem` `(n:Nat) -> bitvector (n + 1) -> bitvector (n + 1) -> bitvector (n + 1)` -`bvAnd` `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` -`bvOr` `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` -`bvXor` `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` -`bvNot` `(n:Nat) -> bitvector n -> bitvector n` -`bvShl` `(n:Nat) -> bitvector n -> Nat -> bitvector n` -`bvShr` `(n:Nat) -> bitvector n -> Nat -> bitvector n` -`bvSShr` `(n:Nat) -> bitvector (n + 1) -> bitvector n -> bitvector (n + 1)` +### Prelude types + +| Name | Kind | Comments | +| ----------- | --------------------- | ----------------------------- | +| `Bool` | `Type` | | +| `Nat` | `Type` | | +| `bitvector` | `Nat -> Type` | Abbreviation for `Vec n Bool` | +| `Vec` | `Nat -> Type -> Type` | | +| `String` | `Type` | | + +### Prelude boolean functions + +| Name | Type | +| -------- | --------------------------------- | +| `and` | `Bool -> Bool -> Bool` | +| `or` | `Bool -> Bool -> Bool` | +| `xor` | `Bool -> Bool -> Bool` | +| `boolEq` | `Bool -> Bool -> Bool` | +| `not` | `Bool -> Bool` | +| `ite` | `(a:Type) -> Bool -> a -> a -> a` | + +### Prelude bit vector functions + +| Name | Type | +| --------- | ------------------------------------------------------------------------ | +| `msb` | `(n:Nat) -> bitvector (n + 1) -> Bool` | +| `bvNat` | `(n:Nat) -> Nat -> bitvector n` | +| `bvToNat` | `(n:Nat) -> bitvector n -> Nat` | +| `bvAdd` | `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` | +| `bvSub` | `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` | +| `bvMul` | `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` | +| `bvUDiv` | `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` | +| `bvURem` | `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` | +| `bvSDiv` | `(n:Nat) -> bitvector (n + 1) -> bitvector (n + 1) -> bitvector (n + 1)` | +| `bvSRem` | `(n:Nat) -> bitvector (n + 1) -> bitvector (n + 1) -> bitvector (n + 1)` | +| `bvAnd` | `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` | +| `bvOr` | `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` | +| `bvXor` | `(n:Nat) -> bitvector n -> bitvector n -> bitvector n` | +| `bvNot` | `(n:Nat) -> bitvector n -> bitvector n` | +| `bvShl` | `(n:Nat) -> bitvector n -> Nat -> bitvector n` | +| `bvShr` | `(n:Nat) -> bitvector n -> Nat -> bitvector n` | +| `bvSShr` | `(n:Nat) -> bitvector (n + 1) -> bitvector n -> bitvector (n + 1)` | diff --git a/doc/limitations.md b/doc/developer/limitations.md similarity index 94% rename from doc/limitations.md rename to doc/developer/limitations.md index 8fe289174c..ec96ddcbe3 100644 --- a/doc/limitations.md +++ b/doc/developer/limitations.md @@ -1,21 +1,5 @@ # Limitations - -**Table of Contents** - -- [Limitations](#limitations) - - [Overview](#overview) - - [Finite Loops](#finite-loops) - - [Identifying and Resolving Non-Termination](#identifying-and-resolving-non-termination) - - [Fixed-size Inputs and Outputs](#fixed-size-inputs-and-outputs) - - [Other Generic Limitations](#other-generic-limitations) - - [Other Java Limitations](#other-java-limitations) - - [Other LLVM Limitations](#other-llvm-limitations) - - [Crucible LLVM Limitations](#crucible-llvm-limitations) - - - - ## Overview There are a few limitations on the types of code that SAW can process. diff --git a/doc/RELEASING.md b/doc/developer/releasing.md similarity index 65% rename from doc/RELEASING.md rename to doc/developer/releasing.md index 69a9cfbbd9..70bde35c7f 100644 --- a/doc/RELEASING.md +++ b/doc/developer/releasing.md @@ -22,22 +22,25 @@ environment of a release consistent. The release process is: 1. Make sure the `release-n.n` branch is in a release/ready state, including: + - successful build artifacts across all platforms, - - successful tests on all test suites, and - - an up-to-date [CHANGES.md](CHANGES.md) file. -1. Create a draft release on GitHub -1. Make a commit on the `release-n.n` branch updating the version in the + - successful tests on all test suites, + - an up-to-date [`SOURCE_DATE_EPOCH`](../.env) for documentation builds, + - up-to-date [documentation PDFs](../pdfs), and + - an up-to-date [CHANGES.md](../../CHANGES.md) file. + +2. Create a draft release on GitHub +3. Make a commit on the `release-n.n` branch updating the version in the `saw-script.cabal` file to `n.n`. This will trigger a build. -1. Once the build is done, download the release artifacts from the +4. Once the build is done, download the release artifacts from the outputs of the build. These are `.zip` files containing `.tar.gz` and `.sig` files. -1. Unpack the `.zip` files and attach the `.tar.gz` and `.sig` files to +5. Unpack the `.zip` files and attach the `.tar.gz` and `.sig` files to the draft release. -1. Publish the release. This will add a tag to the release branch, as +6. Publish the release. This will add a tag to the release branch, as well. -1. Announce the release and update release-relevant information in the following places: +7. Announce the release and update release-relevant information in the following places: - - - - - - - - @galois on Twitter (for major releases) + - + - + - @galois on Twitter (for major releases) diff --git a/doc/manual/figures/galois.pdf b/doc/figures/galois.pdf similarity index 100% rename from doc/manual/figures/galois.pdf rename to doc/figures/galois.pdf diff --git a/doc/index.md b/doc/index.md new file mode 100644 index 0000000000..e053cba0db --- /dev/null +++ b/doc/index.md @@ -0,0 +1,34 @@ +# SAW Documentation + +Welcome to the Software Analysis Workbench (SAW)! + +If you are a new SAW user: + +- [**LLVM/Java Verification with SAW**](llvm-java-verification-with-saw/index) + introduces the core ideas of using SAW to verify LLVM/Java programs. + This is the fastest way to get started with SAW for most use-cases. +- [**Rust Verification with SAW**](rust-verification-with-saw/index) introduces + SAW's experimental support for Rust/MIR verification. + If you would like to be introduced to SAW at the cutting-edge, this is the + right place to start (the LLVM/Java material is _not_ prerequisite). + +If you are looking for SAW/SAWScript reference materials: + +- The [**SAW User Manual**](saw-user-manual/index) is an attempt at + comprehensive user-level documentation of SAW's features. + If the tutorials above don't introduce something you've encountered while + reading or writing SAWScript, it is worth searching the manual to see if it is + covered there. + +If you encounter any bugs, please [open an +issue](https://github.com/GaloisInc/saw-script/issues) following the +[contributing +guidelines](https://github.com/GaloisInc/saw-script/blob/master/CONTRIBUTING.md#issue-tracking). + +:::{toctree} +:hidden: + +llvm-java-verification-with-saw/index +rust-verification-with-saw/index +saw-user-manual/index +::: diff --git a/doc/tutorial/code/Add.class b/doc/llvm-java-verification-with-saw/code/Add.class similarity index 100% rename from doc/tutorial/code/Add.class rename to doc/llvm-java-verification-with-saw/code/Add.class diff --git a/doc/tutorial/code/Add.java b/doc/llvm-java-verification-with-saw/code/Add.java similarity index 100% rename from doc/tutorial/code/Add.java rename to doc/llvm-java-verification-with-saw/code/Add.java diff --git a/doc/tutorial/code/Cipher.cry b/doc/llvm-java-verification-with-saw/code/Cipher.cry similarity index 100% rename from doc/tutorial/code/Cipher.cry rename to doc/llvm-java-verification-with-saw/code/Cipher.cry diff --git a/doc/tutorial/code/DES.cry b/doc/llvm-java-verification-with-saw/code/DES.cry similarity index 100% rename from doc/tutorial/code/DES.cry rename to doc/llvm-java-verification-with-saw/code/DES.cry diff --git a/doc/tutorial/code/FFS.class b/doc/llvm-java-verification-with-saw/code/FFS.class similarity index 100% rename from doc/tutorial/code/FFS.class rename to doc/llvm-java-verification-with-saw/code/FFS.class diff --git a/doc/tutorial/code/FFS.java b/doc/llvm-java-verification-with-saw/code/FFS.java similarity index 100% rename from doc/tutorial/code/FFS.java rename to doc/llvm-java-verification-with-saw/code/FFS.java diff --git a/doc/tutorial/code/Makefile b/doc/llvm-java-verification-with-saw/code/Makefile similarity index 100% rename from doc/tutorial/code/Makefile rename to doc/llvm-java-verification-with-saw/code/Makefile diff --git a/doc/tutorial/code/NQueens.cry b/doc/llvm-java-verification-with-saw/code/NQueens.cry similarity index 100% rename from doc/tutorial/code/NQueens.cry rename to doc/llvm-java-verification-with-saw/code/NQueens.cry diff --git a/doc/tutorial/code/WARNING b/doc/llvm-java-verification-with-saw/code/WARNING similarity index 100% rename from doc/tutorial/code/WARNING rename to doc/llvm-java-verification-with-saw/code/WARNING diff --git a/doc/tutorial/code/basic.bc b/doc/llvm-java-verification-with-saw/code/basic.bc similarity index 100% rename from doc/tutorial/code/basic.bc rename to doc/llvm-java-verification-with-saw/code/basic.bc diff --git a/doc/tutorial/code/basic.c b/doc/llvm-java-verification-with-saw/code/basic.c similarity index 100% rename from doc/tutorial/code/basic.c rename to doc/llvm-java-verification-with-saw/code/basic.c diff --git a/doc/tutorial/code/des-cryptol2.saw b/doc/llvm-java-verification-with-saw/code/des-cryptol2.saw similarity index 100% rename from doc/tutorial/code/des-cryptol2.saw rename to doc/llvm-java-verification-with-saw/code/des-cryptol2.saw diff --git a/doc/tutorial/code/des3.saw b/doc/llvm-java-verification-with-saw/code/des3.saw similarity index 100% rename from doc/tutorial/code/des3.saw rename to doc/llvm-java-verification-with-saw/code/des3.saw diff --git a/doc/tutorial/code/dotprod.bc b/doc/llvm-java-verification-with-saw/code/dotprod.bc similarity index 100% rename from doc/tutorial/code/dotprod.bc rename to doc/llvm-java-verification-with-saw/code/dotprod.bc diff --git a/doc/tutorial/code/dotprod.c b/doc/llvm-java-verification-with-saw/code/dotprod.c similarity index 100% rename from doc/tutorial/code/dotprod.c rename to doc/llvm-java-verification-with-saw/code/dotprod.c diff --git a/doc/tutorial/code/dotprod.cry b/doc/llvm-java-verification-with-saw/code/dotprod.cry similarity index 100% rename from doc/tutorial/code/dotprod.cry rename to doc/llvm-java-verification-with-saw/code/dotprod.cry diff --git a/doc/tutorial/code/dotprod.saw b/doc/llvm-java-verification-with-saw/code/dotprod.saw similarity index 100% rename from doc/tutorial/code/dotprod.saw rename to doc/llvm-java-verification-with-saw/code/dotprod.saw diff --git a/doc/tutorial/code/double.bc b/doc/llvm-java-verification-with-saw/code/double.bc similarity index 100% rename from doc/tutorial/code/double.bc rename to doc/llvm-java-verification-with-saw/code/double.bc diff --git a/doc/tutorial/code/double.c b/doc/llvm-java-verification-with-saw/code/double.c similarity index 100% rename from doc/tutorial/code/double.c rename to doc/llvm-java-verification-with-saw/code/double.c diff --git a/doc/tutorial/code/double.saw b/doc/llvm-java-verification-with-saw/code/double.saw similarity index 100% rename from doc/tutorial/code/double.saw rename to doc/llvm-java-verification-with-saw/code/double.saw diff --git a/doc/tutorial/code/ffs.bc b/doc/llvm-java-verification-with-saw/code/ffs.bc similarity index 100% rename from doc/tutorial/code/ffs.bc rename to doc/llvm-java-verification-with-saw/code/ffs.bc diff --git a/doc/tutorial/code/ffs.c b/doc/llvm-java-verification-with-saw/code/ffs.c similarity index 100% rename from doc/tutorial/code/ffs.c rename to doc/llvm-java-verification-with-saw/code/ffs.c diff --git a/doc/tutorial/code/ffs.cry b/doc/llvm-java-verification-with-saw/code/ffs.cry similarity index 100% rename from doc/tutorial/code/ffs.cry rename to doc/llvm-java-verification-with-saw/code/ffs.cry diff --git a/doc/tutorial/code/ffs_compare.saw b/doc/llvm-java-verification-with-saw/code/ffs_compare.saw similarity index 100% rename from doc/tutorial/code/ffs_compare.saw rename to doc/llvm-java-verification-with-saw/code/ffs_compare.saw diff --git a/doc/tutorial/code/ffs_compare_aig.saw b/doc/llvm-java-verification-with-saw/code/ffs_compare_aig.saw similarity index 100% rename from doc/tutorial/code/ffs_compare_aig.saw rename to doc/llvm-java-verification-with-saw/code/ffs_compare_aig.saw diff --git a/doc/tutorial/code/ffs_gen_aig.saw b/doc/llvm-java-verification-with-saw/code/ffs_gen_aig.saw similarity index 100% rename from doc/tutorial/code/ffs_gen_aig.saw rename to doc/llvm-java-verification-with-saw/code/ffs_gen_aig.saw diff --git a/doc/tutorial/code/ffs_java.saw b/doc/llvm-java-verification-with-saw/code/ffs_java.saw similarity index 100% rename from doc/tutorial/code/ffs_java.saw rename to doc/llvm-java-verification-with-saw/code/ffs_java.saw diff --git a/doc/tutorial/code/ffs_java_crucible.saw b/doc/llvm-java-verification-with-saw/code/ffs_java_crucible.saw similarity index 100% rename from doc/tutorial/code/ffs_java_crucible.saw rename to doc/llvm-java-verification-with-saw/code/ffs_java_crucible.saw diff --git a/doc/tutorial/code/ffs_llvm.saw b/doc/llvm-java-verification-with-saw/code/ffs_llvm.saw similarity index 100% rename from doc/tutorial/code/ffs_llvm.saw rename to doc/llvm-java-verification-with-saw/code/ffs_llvm.saw diff --git a/doc/tutorial/code/java_add.saw b/doc/llvm-java-verification-with-saw/code/java_add.saw similarity index 100% rename from doc/tutorial/code/java_add.saw rename to doc/llvm-java-verification-with-saw/code/java_add.saw diff --git a/doc/tutorial/code/nqueens.saw b/doc/llvm-java-verification-with-saw/code/nqueens.saw similarity index 100% rename from doc/tutorial/code/nqueens.saw rename to doc/llvm-java-verification-with-saw/code/nqueens.saw diff --git a/doc/tutorial/code/picosat.saw b/doc/llvm-java-verification-with-saw/code/picosat.saw similarity index 100% rename from doc/tutorial/code/picosat.saw rename to doc/llvm-java-verification-with-saw/code/picosat.saw diff --git a/doc/llvm-java-verification-with-saw/compositional-proofs.md b/doc/llvm-java-verification-with-saw/compositional-proofs.md new file mode 100644 index 0000000000..4e78513387 --- /dev/null +++ b/doc/llvm-java-verification-with-saw/compositional-proofs.md @@ -0,0 +1,54 @@ +# Compositional Proofs + +The examples shown so far treat programs as monolithic entities. A +Java method or C function, along with all of its callees, is +translated into a single mathematical model. SAWScript also has +support for more compositional proofs, as well as proofs about +functions that use heap data structures. + +## Compositional Imperative Proofs + +As a simple example of compositional reasoning on imperative programs, +consider the following Java code. + +:::{literalinclude} code/Add.java +:language: java +::: + +Here, the `add` function computes the sum of its arguments. The `dbl` +function then calls `add` to double its argument. While it would be easy +to prove that `dbl` doubles its argument by following the call to `add`, +it's also possible in SAWScript to prove something about `add` first, +and then use the results of that proof in the proof of `dbl`, as in the +following SAWScript code (`java_add.saw` on GitHub). + +:::{literalinclude} code/java_add.saw +:language: sawscript +::: + +This can be run as follows: + +:::{code-block} console +$ saw -b java_add.saw +::: + +In this example, the definitions of `add_spec` and `dbl_spec` provide +extra information about how to configure the symbolic simulator when +analyzing Java code. In this case, the setup blocks provide explicit +descriptions of the implicit configuration used by +`jvm_extract` (used in the earlier Java FFS example and in the +next section). The `jvm_fresh_var` commands instruct the simulator to +create fresh symbolic inputs to correspond to the Java variables `x` and +`y`. Then, the `jvm_return` commands indicate the expected return value +of the each method, in terms of existing models (which can be written +inline). Because Java methods can operate on references, as well, which +do not exist in Cryptol, Cryptol expressions must be translated to JVM +values with `jvm_term`. + +To make use of these setup blocks, the `jvm_verify` command analyzes +the method corresponding to the class and method name provided, using +the setup block passed in as a parameter. It then returns an object +that describes the proof it has just performed. This object can be +passed into later instances of `jvm_verify` to indicate that calls to +the analyzed method do not need to be followed, and the previous proof +about that method can be used instead of re-analyzing it. diff --git a/doc/llvm-java-verification-with-saw/example-find-first-set.md b/doc/llvm-java-verification-with-saw/example-find-first-set.md new file mode 100644 index 0000000000..9ac5aba229 --- /dev/null +++ b/doc/llvm-java-verification-with-saw/example-find-first-set.md @@ -0,0 +1,239 @@ +# Example: Find First Set + +As a first example, we consider showing the equivalence of several quite +different implementations of the POSIX `ffs` function, which identifies +the position of the first ``1`` bit in a word. The function takes an +integer as input, treated as a vector of bits, and returns another +integer which indicates the index of the first bit set (zero being the +least significant). This function can be implemented in several ways +with different performance and code clarity tradeoffs, and we would like +to show those different implementations are equivalent. + +## Reference Implementation + +One simple implementation takes the form of a loop with an index +initialized to zero, and a mask initialized to have the least +significant bit set. On each iteration, we increment the index, and +shift the mask to the left. Then we can use a bitwise "and" operation to +test the bit at the index indicated by the index variable. The following +C code (which is also in the `ffs.c` file on GitHub) uses this approach. + +:::{literalinclude} code/ffs.c +:lines: 9-17 +:language: c +::: + +This implementation is relatively straightforward, and a proficient C +programmer would probably have little difficulty believing its +correctness. However, the number of branches taken during execution +could be as many as 32, depending on the input value. It's possible to +implement the same algorithm with significantly fewer branches, and no +backward branches. + +## Alternative Implementations + +An alternative implementation, taken by the following function (also in +`ffs.c`), treats the bits of the input word in chunks, allowing +sequences of zero bits to be skipped over more quickly. + +:::{literalinclude} code/ffs.c +:lines: 19-26 +:language: c +::: + +Another optimized version, in the following rather mysterious program +(also in `ffs.c`), based on the `ffs` implementation in [musl +libc](http://musl.libc.org/). + +:::{literalinclude} code/ffs.c +:lines: 69-76 +:language: c +::: + +These optimized versions are much less obvious than the reference +implementation. They might be faster, but how do we gain confidence +that they calculate the same results as the reference implementation? + +Finally, consider a buggy implementation which is correct on all but one +possible input (also in `ffs.c`). Although contrived, this program +represents a case where traditional testing -- as opposed to +verification -- is unlikely to be helpful. + +:::{literalinclude} code/ffs.c +:lines: 43-47 +:language: c +::: + +SAWScript allows us to state these problems concisely, and to quickly +and automatically 1) prove the equivalence of the reference and +optimized implementations on all possible inputs, and 2) find an input +exhibiting the bug in the third version. + +## Generating LLVM Code + +SAW can analyze LLVM code, but most programs are originally written in a +higher-level language such as C, as in our example. Therefore, the C +code must be translated to LLVM, using something like the following +command: + +:::{code-block} console +$ clang -g -c -emit-llvm -o ffs.bc ffs.c +::: + +The `-g` flag instructs `clang` to include debugging information, which +is useful in SAW to refer to variables and struct fields using the same +names as in C. We have tested SAW successfully with versions of `clang` +from 3.6 to 7.0. Please report it as a bug [on +GitHub](https://github.com/GaloisInc/saw-script/issues) if SAW fails to +parse any LLVM bitcode file. + +This command, and following command examples in this tutorial, can be +run from the `code` directory [on +GitHub](https://github.com/GaloisInc/saw-script/tree/master/doc/tutorial/code). +A `Makefile` also exists in that directory, providing quick shortcuts +for tasks like this. For instance, we can get the same effect as the +previous command by running: + +:::{code-block} console +$ make ffs.bc +::: + +## Equivalence Proof + +We now show how to use SAWScript to prove the equivalence of the +reference and implementation versions of the FFS algorithm, and +exhibit the bug in the buggy implementation. + +A SAWScript program is typically structured as a sequence of commands, +potentially along with definitions of functions that abstract over +commonly-used combinations of commands. + +The following script (in `ffs_llvm.saw`) is sufficient to automatically +prove the equivalence of `ffs_ref` with `ffs_imp` and `ffs_musl`, and +identify the bug in `ffs_bug`. + +:::{literalinclude} code/ffs_llvm.saw +:language: sawscript +::: + +In this script, the `print` commands simply display text for the user. +The `llvm_extract` command instructs the SAWScript interpreter +to perform symbolic simulation of the given C function (e.g., `ffs_ref`) +from a given bitcode file (e.g., `ffs.bc`), and return a term +representing the semantics of the function. + +The `let` statement then constructs a new term corresponding to the +assertion of equality between two existing terms. Arbitrary +Cryptol expressions can be embedded within SAWScript; to distinguish +Cryptol code from SAWScript commands, the Cryptol code is placed +within double brackets `{{` and `}}`. + +The `prove` command can verify the validity of such an assertion, or +produce a counter-example that invalidates it. The `abc` parameter +indicates what theorem prover to use; SAWScript offers support for many +other SAT and SMT solvers as well as user definable simplification +tactics. + +Similarly, the `sat` command works in the opposite direction to `prove`. +It attempts to find a value for which the given assertion is true, and +fails if there is no such value. + +If the `saw` executable is in your PATH, you can run the script above with + +:::{code-block} console +$ saw ffs_llvm.saw + + + +Loading file "ffs_llvm.saw" +Extracting reference term: ffs_ref +Extracting implementation term: ffs_imp +Extracting implementation term: ffs_musl +Extracting buggy term: ffs_bug +Proving equivalence: ffs_ref == ffs_imp +Valid +Proving equivalence: ffs_ref == ffs_musl +Valid +Finding bug via sat search: ffs_ref != ffs_bug +Sat: [x = 0x101010] +Finding bug via failed proof: ffs_ref == ffs_bug +prove: 1 unsolved subgoal(s) +Invalid: [x = 0x101010] +Done. +::: + +Note that both explicitly searching for an input exhibiting the bug +(with `sat`) and attempting to prove the false equivalence (with +`prove`) exhibit the bug. Symmetrically, we could use `sat` to prove the +equivalence of `ffs_ref` and `ffs_imp`, by checking that the +corresponding disequality is unsatisfiable. Indeed, this exactly what +happens behind the scenes: `prove abc ` is essentially `not (sat +abc (not ))`. + +## Cross-Language Proofs + +We can implement the FFS algorithm in Java with code almost identical +to the C version. + +The reference version (in `FFS.java`) uses a loop, like the C version: + +:::{literalinclude} code/FFS.java +:lines: 2-10 +:language: java +::: + +And the efficient implementation uses a fixed sequence of masking and +shifting operations: + +:::{literalinclude} code/FFS.java +:lines: 12-19 +:language: java +::: + +Although in this case we can look at the C and Java code and see that +they perform almost identical operations, the low-level operators +available in C and Java do differ somewhat. Therefore, it would be +nice to be able to gain confidence that they do, indeed, perform the +same operation. + +We can do this with a process very similar to that used to compare the +reference and implementation versions of the algorithm in a single +language. + +First, we compile the Java code to a JVM class file. + +:::{code-block} console +$ javac -g FFS.java +::: + +Like with `clang`, the `-g` flag instructs `javac` to include debugging +information, which can be useful to preserve variable names. + +Using `saw` with Java code requires a command-line option `-b` that +locates Java. Run the code in this section with the command: + +:::{code-block} console +$ saw -b ffs_compare.saw +::: + +Alternatively, if Java is located on your `PATH`, you can omit the `-b` +option entirely. + +Both Oracle JDK and OpenJDK versions 6 through 8 work well with SAW. +SAW also includes experimental support for JDK 9 and later. Code that only +involves primitive data types (such as `FFS.java` above) is known to work well +under JDK 9+, but there are some as-of-yet unresolved issues in verifying code +involving classes such as `String`. For more information on these issues, refer +to [this GitHub issue](https://github.com/GaloisInc/crucible/issues/641). + +Now we can do the proof both within and across languages (from +`ffs_compare.saw`): + +:::{literalinclude} code/ffs_compare.saw +:language: sawscript +::: + +Here, the `jvm_extract` function works like `llvm_extract`, but on a +Java class and method name. The `prove_print` command works similarly +to the `prove` followed by `print` combination used for the LLVM +example above. diff --git a/doc/llvm-java-verification-with-saw/external-sat-solvers.md b/doc/llvm-java-verification-with-saw/external-sat-solvers.md new file mode 100644 index 0000000000..5e38363d6a --- /dev/null +++ b/doc/llvm-java-verification-with-saw/external-sat-solvers.md @@ -0,0 +1,36 @@ +# External SAT Solvers + +In addition to the `abc`, `z3`, and `yices` proof tactics used +above, SAWScript can also invoke arbitrary external SAT solvers +that read CNF files and produce results according to the SAT +competition +[input and output conventions](https://jix.github.io/varisat/manual/0.2.0/formats/dimacs.html), +using the `external_cnf_solver` tactic. For example, you can use +[PicoSAT](http://fmv.jku.at/picosat/) to prove the theorem `thm` from +the last example, with the following commands: + +:::{code-block} sawscript +let picosat = external_cnf_solver "picosat" ["%f"]; +prove_print picosat thm;x +::: + +The use of `let` is simply a convenient abbreviation. The following +would be equivalent: + +:::{code-block} sawscript +prove_print (external_cnf_solver "picosat" ["%f"]) thm; +::: + +The first argument to `external_cnf_solver` is the name of the +executable. It can be a fully-qualified name, or simply the bare +executable name if it's in your PATH. The second argument is an array +of command-line arguments to the solver. Any occurrence of `%f` is +replaced with the name of the temporary file containing the CNF +representation of the term you're proving. + +The `external_cnf_solver` tactic is based on the same underlying +infrastructure as the `abc` tactic, and is generally capable of +proving the same variety of theorems. + +To write a CNF file without immediately invoking a solver, use the +`offline_cnf` tactic, or the `write_cnf` top-level command. diff --git a/doc/llvm-java-verification-with-saw/index.md b/doc/llvm-java-verification-with-saw/index.md new file mode 100644 index 0000000000..d0b6f0909f --- /dev/null +++ b/doc/llvm-java-verification-with-saw/index.md @@ -0,0 +1,25 @@ +# LLVM/Java Verification with SAW + +This tutorial introduces SAW's LLVM/Java verification capabilities. +After completing this tutorial, you will be able to: + +- Show that alternative implementations of a function, even across languages, + are equivalent +- Use external SMT/SAT solvers to augment SAW's internal proof system +- Write proofs in SAW _compositionally_ +- Use the SAW REPL to interactively run proofs + +You can download {download-dir}`the tutorial files `. + +```{toctree} +:maxdepth: 2 +:caption: Contents + +introduction +example-find-first-set +using-smt-lib-solvers +external-sat-solvers +compositional-proofs +interactive-interpreter +other-examples +``` diff --git a/doc/llvm-java-verification-with-saw/interactive-interpreter.md b/doc/llvm-java-verification-with-saw/interactive-interpreter.md new file mode 100644 index 0000000000..eae1a95325 --- /dev/null +++ b/doc/llvm-java-verification-with-saw/interactive-interpreter.md @@ -0,0 +1,74 @@ +# Interactive Interpreter + +The examples so far have used SAWScript in batch mode on complete +script files. It also has an interactive Read-Eval-Print Loop (REPL) +which can be convenient for experimentation. To start the REPL, run +SAWScript with no arguments: + +:::{code-block} console +$ saw +::: + +The REPL can evaluate any command that would appear at the top level +of a standalone script, or in the `main` function, as well as a few +special commands that start with a colon: + +:::{code-block} console + + :env display the current sawscript environment + :type check the type of an expression + :browse display the current environment + :eval evaluate an expression and print the result + :? display a brief description about a built-in operator + :help display a brief description about a built-in operator + :quit exit the REPL + :load load a module + :add load an additional module + :cd set the current working directory +::: + +As an example of the sort of interactive use that the REPL allows, +consider the file `code/NQueens.cry`, which provides a Cryptol +specification of the problem of placing a specific number of queens on +a chess board in such a way that none of them threaten any of the +others. + +:::{literalinclude} code/NQueens.cry +:lines: 21-56 +:language: cryptol +::: + +This example gives us the opportunity to use the satisfiability +checking capabilities of SAWScript on a problem other than +equivalence verification. + +First, we can load a model of the `nQueens` term from the Cryptol file. + +:::{code-block} console +sawscript> m <- cryptol_load "NQueens.cry" +sawscript> let nq8 = {{ m::nQueens`{8} }} +::: + +Once we've extracted this model, we can try it on a specific +configuration to see if it satisfies the property that none of the +queens threaten any of the others. + +:::{code-block} console +sawscript> print {{ nq8 [0,1,2,3,4,5,6,7] }} +False +::: + +This particular configuration didn't work, but we can use the +satisfiability checking tools to automatically find one that does. + +:::{code-block} console +sawscript> sat_print abc nq8 +Sat [qs = [3, 1, 6, 2, 5, 7, 4, 0]] +::: + +And, finally, we can double-check that this is indeed a valid solution. + +:::{code-block} console +sawscript> print {{ nq8 [3,1,6,2,5,7,4,0] }} +True +::: diff --git a/doc/llvm-java-verification-with-saw/introduction.md b/doc/llvm-java-verification-with-saw/introduction.md new file mode 100644 index 0000000000..4e0e45a760 --- /dev/null +++ b/doc/llvm-java-verification-with-saw/introduction.md @@ -0,0 +1,21 @@ +# Introduction + +SAWScript is a special-purpose programming language developed by +Galois to help orchestrate and track the results of the large +collection of proof tools necessary for analysis and verification of +complex software artifacts. + +The language adopts the functional paradigm, and largely follows the +structure of many other typed functional languages, with some special +features specifically targeted at the coordination of verification and +analysis tasks. + +This tutorial introduces the details of the language by walking through +several examples, and showing how simple verification tasks can be +described. The complete examples are available [on +GitHub](https://github.com/GaloisInc/saw-script/tree/master/doc/tutorial/code). +Most of the examples make use of inline specifications written in +Cryptol, a language originally designed for high-level descriptions of +cryptographic algorithms. For readers unfamiliar with Cryptol, various +documents describing its use are available +[here](http://cryptol.net/documentation.html). diff --git a/doc/llvm-java-verification-with-saw/other-examples.md b/doc/llvm-java-verification-with-saw/other-examples.md new file mode 100644 index 0000000000..dd033d369d --- /dev/null +++ b/doc/llvm-java-verification-with-saw/other-examples.md @@ -0,0 +1,62 @@ +# Other Examples + +The `code` directory [on +GitHub](https://github.com/GaloisInc/saw-script/tree/master/doc/tutorial/code) +contains a few additional examples not mentioned so far. These remaining +examples don't cover significant new material, but help fill in some +extra use cases that are similar, but not identical to those already +covered. + +## Java Equivalence Checking + +The previous examples showed comparison between two different LLVM +implementations, and cross-language comparisons between Cryptol, Java, +and LLVM. The script in `ffs_java.saw` compares two different Java +implementations, instead. + +:::{literalinclude} code/ffs_java.saw +:language: sawscript +::: + +As with previous Java examples, this one needs to be run with the `-b` +flag to tell the interpreter where to find Java: + +:::{code-block} console +$ saw -b ffs_java.saw +::: + +## AIG Export and Import + +Most of the previous examples have used the `abc` tactic to discharge +theorems. This tactic works by translating the given term to +And-Inverter Graph (AIG) format, transforming the graph in various +ways, and then using a SAT solver to complete the proof. + +Alternatively, the `write_aig` command can be used to write an AIG +directly to a file, in [AIGER format](http://fmv.jku.at/aiger/), for +later processing by external tools, as shown in +`code/ffs_gen_aig.saw`. + +:::{literalinclude} code/ffs_gen_aig.saw +:language: sawscript +::: + +Conversely, the `read_aig` command can construct an internal term from +an existing AIG file, as shown in `ffs_compare_aig.saw`. + +:::{literalinclude} code/ffs_compare_aig.saw +:language: sawscript +::: + +We can use external AIGs to verify the equivalence as follows, +generating the AIGs with the first script and comparing them with the +second: + +:::{code-block} console +$ saw -b ffs_gen_aig.saw +$ saw ffs_compare_aig.saw +::: + +Files in AIGER format can be produced and processed by several +external tools, including ABC, Cryptol version 1, and various hardware +synthesis and verification systems. diff --git a/doc/llvm-java-verification-with-saw/using-smt-lib-solvers.md b/doc/llvm-java-verification-with-saw/using-smt-lib-solvers.md new file mode 100644 index 0000000000..a2e73cb9db --- /dev/null +++ b/doc/llvm-java-verification-with-saw/using-smt-lib-solvers.md @@ -0,0 +1,42 @@ +# Using SMT-Lib Solvers + +The examples presented so far have used the internal proof system +provided by SAWScript, based primarily on a version of the ABC tool +from UC Berkeley linked into the `saw` executable. However, there is +internal support for other proof tools -- such as Yices and Z3 as +illustrated in the next example -- and more general support for +exporting models representing theorems as goals in the SMT-Lib +language. These exported goals can then be solved using an external +SMT solver. + +Consider the following C file: + +:::{literalinclude} code/double.c +:language: c +::: + +In this trivial example, an integer can be doubled either using +multiplication or shifting. The following SAWScript program (in +`double.saw`) verifies that the two are equivalent using both internal +Yices and Z3 modes, and by exporting an SMT-Lib theorem to be +checked later, by an external SAT solver. + +:::{literalinclude} code/double.saw +:language: sawscript +::: + +The new primitives introduced here are the tilde operator, `~`, which +constructs the logical negation of a term, and `write_smtlib2`, which +writes a term as a proof obligation in SMT-Lib version 2 format. Because +SMT solvers are satisfiability solvers, their default behavior is to +treat free variables as existentially quantified. By negating the input +term, we can instead treat the free variables as universally quantified: +a result of "unsatisfiable" from the solver indicates that the original +term (before negation) is a valid theorem. The `prove` primitive does +this automatically, but for flexibility the `write_smtlib2` primitive +passes the given term through unchanged, because it might be used for +either satisfiability or validity checking. + +The SMT-Lib export capabilities in SAWScript make use of the Haskell SBV +package, and support ABC, Bitwuzla, Boolector, CVC4, CVC5, MathSAT, Yices, and +Z3. diff --git a/doc/manual/Makefile b/doc/manual/Makefile deleted file mode 100644 index 9733ce9e24..0000000000 --- a/doc/manual/Makefile +++ /dev/null @@ -1,38 +0,0 @@ -TARGET = manual -SRCS = ${TARGET}.md ${wildcard *.bib} ${wildcard *.cls} ${wildcard *.sty} -CODE = ${wildcard code/*.c} \ - ${wildcard code/*.cry} \ - ${wildcard code/*.java} \ - ${wildcard code/*.saw} -AUX = ${wildcard *.blg} ${wildcard *.bbl} ${wildcard *.aux} \ - ${wildcard *.eps} ${wildcard *.log} ${wildcard *.toc} -PDFARGS = -H head.tex \ - -B before.tex \ - --listings \ - -V documentclass:galois-whitepaper \ - -V fontsize:12 \ - --toc \ - --pdf-engine=pdflatex -HTMLARGS = --css doc.css \ - --standalone \ - --toc \ - --self-contained - -all: pdf - -pdf: ${TARGET}.pdf - -${TARGET}.pdf: ${TARGET}.md Makefile galois-whitepaper.cls - pandoc ${PDFARGS} -o $@ ${TARGET}.md - -${TARGET}.tex: ${TARGET}.md Makefile galois-whitepaper.cls - pandoc ${PDFARGS} -o $@ ${TARGET}.md - -${TARGET}.html: ${TARGET}.md Makefile - pandoc ${HTMLARGS} -o $@ ${TARGET}.md - -clean: - rm -f ${AUX} - -superClean: clean - rm -f ${TARGET}.pdf ${TARGET}.html diff --git a/doc/manual/before.tex b/doc/manual/before.tex deleted file mode 100644 index 2a8b21e849..0000000000 --- a/doc/manual/before.tex +++ /dev/null @@ -1 +0,0 @@ -\maketitle diff --git a/doc/manual/code/Add.class b/doc/manual/code/Add.class deleted file mode 100644 index f834d518c7..0000000000 Binary files a/doc/manual/code/Add.class and /dev/null differ diff --git a/doc/manual/code/Add.java b/doc/manual/code/Add.java deleted file mode 100644 index 6dc4f499c5..0000000000 --- a/doc/manual/code/Add.java +++ /dev/null @@ -1,9 +0,0 @@ -class Add { - public int add(int x, int y) { - return x + y; - } - - public int dbl(int x) { - return add(x, x); - } -} diff --git a/doc/manual/code/java_symexec.saw b/doc/manual/code/java_symexec.saw deleted file mode 100644 index 40f8748755..0000000000 --- a/doc/manual/code/java_symexec.saw +++ /dev/null @@ -1,15 +0,0 @@ -// show that add(x,y) == add(y,x) for all x and y -enable_deprecated; -cadd <- java_load_class "Add"; -x <- fresh_symbolic "x" {| [32] |}; -y <- fresh_symbolic "y" {| [32] |}; -res <- java_symexec cadd "add" [("x", x), ("y", y)] ["return", "$safety"] true; -let jadd = rewrite basic_ss {{ res.0 }}; -let safe = {{ res.1 }}; -jadd' <- abstract_symbolic jadd; -print_term jadd'; -print "Proving commutivity:"; -prove_print abc {{ \a b -> jadd' a b == jadd' b a }}; -print "Proving safety:"; -prove_print abc safe; -print "Done."; diff --git a/doc/manual/code/loop.bc b/doc/manual/code/loop.bc deleted file mode 100644 index 8a6886c3bd..0000000000 Binary files a/doc/manual/code/loop.bc and /dev/null differ diff --git a/doc/manual/code/loop.c b/doc/manual/code/loop.c deleted file mode 100644 index fca7f4fae8..0000000000 --- a/doc/manual/code/loop.c +++ /dev/null @@ -1,10 +0,0 @@ -#include - -uint8_t f(uint8_t i) { - int done = 0; - while (!done){ - if (i % 8 == 0) done = 1; - i += 5; - } - return i; -} diff --git a/doc/manual/doc.css b/doc/manual/doc.css deleted file mode 100644 index 08327aa2df..0000000000 --- a/doc/manual/doc.css +++ /dev/null @@ -1,93 +0,0 @@ -body { - font-size: 14px; - line-height: 21px; - color: #333; - font-family: "Helvetica Neue", "Lucida Grande", "Lucida Sans Unicode", Helvetica, Arial, sans-serif !important; -} -.container { - width: 950px; - margin: 0; - padding: 80px 0px 50px 50px; -} -p, li { - width: 625px; -} -a { - color: #191933; -} -h1, h2, h3, h4, h5, h6, b.header { - color: #000; - margin-top: 40px; - margin-bottom: 15px; - text-shadow: #fff 0 1px 1px; -} -h1 { font-size: 18px } -h2 { font-size: 16px } -h3, h4, h5, h6 { font-size: 14px } -.author { font-weight: normal } - -br.clear { - height: 0; - clear: both; -} -ul { - padding-left: 20px; -} -b.header { - display: block; -} -li { - margin-bottom: 10px; -} -code, pre, tt, textarea { - font-family: Monaco, Consolas, "Lucida Console", monospace; - font-size: 12px; - line-height: 18px; - color: #155; - white-space: pre-wrap; - word-wrap: break-word; -} - tt { - display: inline-block; - background: #fff; - border: 1px solid #dedede; - padding: 0px 0.2em; - } - pre { - border-left: 5px solid rgba(0,0,0,0.2); - padding: 3px 0 3px 12px; - font-size: 12px; - } - pre.no_bar { - border-left: 0; - margin-left: 0; - padding-left: 0; - } -div.code { - position: relative; - background: #fff; - border: 1px solid #d8d8d8; - -webkit-box-shadow: 0px 0px 4px rgba(0,0,0,0.23); - -moz-box-shadow: 0px 0px 4px rgba(0,0,0,0.23); - box-shadow: 0px 0px 4px rgba(0,0,0,0.23); - zoom: 1; -} - div.code { - text-transform: none; - position: absolute; - right: 8px; bottom: 8px; - } - div.code { - left: 8px; right: auto; - } - div.code pre, div.code textarea { - float: left; - width: 450px; - background: #fff; - border-left: 1px dotted #d0d0d0; - margin: 10px 0 15px 3px; - padding: 0 0 0 12px; - } - div.code pre:first-child { - border-left: 0; - } diff --git a/doc/manual/galois-whitepaper.cls b/doc/manual/galois-whitepaper.cls deleted file mode 100644 index 50bcbb878a..0000000000 --- a/doc/manual/galois-whitepaper.cls +++ /dev/null @@ -1,143 +0,0 @@ -% This class file defines a style useful for writing whitepapers at Galois. -\ProvidesClass{galois-whitepaper}[2010/11/02 Galois whitepaper class] -\newif\if@xelatex -\@xelatexfalse -\DeclareOption{xelatex}{\@xelatextrue} -\DeclareOption*{% -\PassOptionsToClass{\CurrentOption}{article}% -} -\ProcessOptions\relax -\LoadClass{article} - -% Useful packages: -% \RequirePackage[table]{xcolor} -\RequirePackage{graphicx} -%\RequirePackage[headheight=144pt,top=1in,left=1in,right=1in,bottom=1in]{geometry} -\RequirePackage[top=1in,left=1in,right=1in,bottom=1in]{geometry} -\RequirePackage{lastpage} - -% Define fonts -\if@xelatex -\RequirePackage{fontspec} -\RequirePackage{xunicode} -\RequirePackage{xltxtra} -\setmainfont[Mapping=tex-text]{Palatino} -\setsansfont[Mapping=tex-text]{Arial} -\else -\RequirePackage[sc]{mathpazo} -\linespread{1.05} -\RequirePackage[scaled]{helvet} -\fi - -\RequirePackage{sectsty} -\allsectionsfont{\sffamily} - -\newcommand{\highlight}[2][yellow]{\colorbox{#1}{#2}} - -% Misc defines -\makeatletter - \newcommand{\@companyName}{Galois, Inc.} - \newcommand{\@addressPhone}{ - 421 SW 6th Ave., Ste.\ 300\\ - Portland, OR 97204 - } - \newcommand{\@contact}{\todo[Contact]} - \newcommand{\@subtitle}{TODO: Subtitle} - \newcommand{\contact}[1]{\renewcommand{\@contact}{{#1}}} -\makeatother - -\renewcommand{\paragraph}[1]{\textbf{\sffamily{#1}}\quad} - -% \definecolor{galoislightgray}{gray}{0.6} -% Define header and footer -\RequirePackage{fancyhdr} -\pagestyle{fancy} -\fancyheadoffset[l]{1in} -\fancyheadoffset[r]{0in} -%\fancyheadoffset[r]{0.2in} -\fancyhead{}%clear -\fancyfoot{}%clear -\renewcommand{\headrulewidth}{0pt} -\renewcommand{\footrulewidth}{1pt} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% -%% Define a special header style for the title page: -% - \makeatletter - \newcommand{\ps@title}{% - \renewcommand{\@oddhead}{% - \sffamily\includegraphics[width=1in]{figures/galois.pdf} - }} -\makeatother - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% -%% Set up the headers/footers for the rest of the document: -%% -\lhead{\sffamily\hspace{1in}\includegraphics[width=1in]{figures/galois.pdf}} - -\makeatletter -\rhead{\sffamily \textit{\@title{}}} -\makeatother - -\lfoot{\sffamily\color{lightgray}\small\copyright{} 2021 Galois, Inc.} - -\rfoot{ - % \sffamily \color{galoislightgray}\small - \sffamily \color{lightgray}\small - Page \thepage{}% of \pageref{LastPage} -} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% -%% The title content definition: -%% -\makeatletter -\DeclareRobustCommand{\maketitle}{ - \thispagestyle{title} - \begin{center} - {\sffamily - {\large \textbf{}}\\ - \vspace{0.3in} - \textbf{\sffamily \LARGE \@title}\\ - %\vspace{0.5in} - %{ \@author }\\ - \vspace{0.3in} - {\large \@companyName }\\ - \vspace{0.025in} - {\small \@addressPhone }\\ - } - \end{center} -} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% -%% Add a stop ``.'' after each section number: -%% - -% Redefine the bibliography environment to use \section{\bibname} -% instead of \section*{\bibname} -\newcommand{\bibname}{References} -\renewenvironment{thebibliography}[1] - {\section*{\bibname}% <-- this line was changed from \chapter* to \section* - \@mkboth{\MakeUppercase\bibname}{\MakeUppercase\bibname}% - \list{\@biblabel{\@arabic\c@enumiv}}% - {\settowidth\labelwidth{\@biblabel{#1}}% - \leftmargin\labelwidth - \advance\leftmargin\labelsep - \@openbib@code - \usecounter{enumiv}% - \let\p@enumiv\@empty - \renewcommand\theenumiv{\@arabic\c@enumiv}}% - \sloppy - \clubpenalty4000 - \@clubpenalty \clubpenalty - \widowpenalty4000% - \sfcode`\.\@m} - {\def\@noitemerr - {\@latex@warning{Empty `thebibliography' environment}}% - \endlist} - -\makeatother diff --git a/doc/manual/head.tex b/doc/manual/head.tex deleted file mode 100644 index 42938a2a75..0000000000 --- a/doc/manual/head.tex +++ /dev/null @@ -1,44 +0,0 @@ -\usepackage{listings} -\usepackage{xspace} -\usepackage{color} -\usepackage{tikz} -\usepackage{url} -\usepackage{amsmath} -\usepackage{amscd} -\usepackage{verbatim} -\usepackage{fancyvrb} - -\let\verbatiminput=\verbatimtabinput -\VerbatimFootnotes -\DefineVerbatimEnvironment{code}{Verbatim}{} -\DefineVerbatimEnvironment{pseudoCode}{Verbatim}{} -\renewcommand{\textfraction}{0.05} -\renewcommand{\topfraction}{0.95} -\renewcommand{\bottomfraction}{0.95} -\renewcommand{\floatpagefraction}{0.35} -\definecolor{MyGray}{rgb}{0.9,0.9,0.9} - -\lstset{ - basicstyle=\ttfamily, - numberstyle=\tiny, - numbersep=5pt, - extendedchars=true, - breaklines=true, - showspaces=false, - showtabs=false, - xleftmargin=10pt, - xrightmargin=5pt, - framexleftmargin=5pt, - framexrightmargin=-1pt, - framexbottommargin=4pt, - showstringspaces=false, - % Note that the `1` at the end is required. Per the documentation for - % the `listings` package (see Section 5.4 of - % https://mirrors.mit.edu/CTAN/macros/latex/contrib/listings/listings.pdf), - % the `1` indicates the length of the replacement text (i.e., a single - % character). - literate={ș}{{\textcommabelow{s}}}1, -} - -\author{The Galois SAW Team\\\texttt{saw@galois.com}} -\title{Using the Software Analysis Workbench (SAW)} diff --git a/doc/manual/manual.md b/doc/manual/manual.md deleted file mode 100644 index 16ec4d6ca3..0000000000 --- a/doc/manual/manual.md +++ /dev/null @@ -1,5099 +0,0 @@ -## Overview - -The Software Analysis Workbench (SAW) is a tool for constructing -mathematical models of the computational behavior of software, -transforming these models, and proving properties about them. - -SAW can currently construct models of a subset of programs written in -Cryptol, LLVM (and therefore C), and JVM (and therefore Java). SAW also has -experimental, incomplete support for MIR (and therefore Rust). The -models take the form of typed functional programs, so in a sense SAW can -be considered a translator from imperative programs to their functional -equivalents. Various external proof tools, including a variety of SAT -and SMT solvers, can be used to prove properties about the functional -models. SAW can construct models from arbitrary Cryptol programs, and -from C and Java programs that have fixed-size inputs and outputs and -that terminate after a fixed number of iterations of any loop (or a -fixed number of recursive calls). One common use case is to verify that -an algorithm specification in Cryptol is equivalent to an algorithm -implementation in C or Java. - -The process of extracting models from programs, manipulating them, -forming queries about them, and sending them to external provers is -orchestrated using a special purpose language called SAWScript. -SAWScript is a typed functional language with support for sequencing of -imperative commands. - -The rest of this document first describes how to use the SAW tool, -`saw`, and outlines the structure of the SAWScript language and its -relationship to Cryptol. It then presents the SAWScript commands that -transform functional models and prove properties about them. Finally, it -describes the specific commands available for constructing models from -imperative programs. - -# Invoking SAW - -The primary mechanism for interacting with SAW is through the `saw` -executable included as part of the standard binary distribution. With no -arguments, `saw` starts a read-evaluate-print loop (REPL) that allows -the user to interactively evaluate commands in the SAWScript language. -With one file name argument, it executes the specified file as a SAWScript -program. - -In addition to a file name, the `saw` executable accepts several -command-line options: - -`-h, -?, --help` - - ~ Print a help message. - -`-V, --version` - - ~ Show the version of the SAWScript interpreter. - -`-c path, --classpath=path` - - ~ Specify a colon-delimited list of paths to search for Java classes. - -`-i path, --import-path=path` - - ~ Specify a colon-delimited list of paths to search for imports. - -`-t, --extra-type-checking` - - ~ Perform extra type checking of intermediate values. - -`-I, --interactive` - - ~ Run interactively (with a REPL). This is the default if no other - arguments are specified. - -`-j path, --jars=path` - - ~ Specify a colon-delimited list of paths to `.jar` files to search - for Java classes. - -`-b path, --java-bin-dirs` - - ~ Specify a colon-delimited list of paths to search for a Java - executable. - -`-d num, --sim-verbose=num` - - ~ Set the verbosity level of the Java and LLVM simulators. - -`-v num, --verbose=num` - - ~ Set the verbosity level of the SAWScript interpreter. - -`--clean-mismatched-versions-solver-cache[=path]` - - ~ Run the `clean_mismatched_versions_solver_cache` command on the solver - cache at the given path, or if no path is given, the solver cache at the - value of the `SAW_SOLVER_CACHE_PATH` environment variable, then exit. See - the section **Caching Solver Results** for a description of the - `clean_mismatched_versions_solver_cache` command and the solver caching - feature in general. - -SAW also uses several environment variables for configuration: - -`CRYPTOLPATH` - - ~ Specify a colon-delimited list of directory paths to search for Cryptol - imports (including the Cryptol prelude). - -`PATH` - - ~ If the `--java-bin-dirs` option is not set, then the `PATH` will be - searched to find a Java executable. - -`SAW_IMPORT_PATH` - - ~ Specify a colon-delimited list of directory paths to search for imports. - -`SAW_JDK_JAR` - - ~ Specify the path of the `.jar` file containing the core Java - libraries. Note that that is not necessary if the `--java-bin-dirs` option - or the `PATH` environment variable is used, as SAW can use this information - to determine the location of the core Java libraries' `.jar` file. - -`SAW_SOLVER_CACHE_PATH` - - ~ Specify a path at which to keep a cache of solver results obtained during - calls to certain tactics. A cache is not created at this path until it is - needed. See the section **Caching Solver Results** for more detail about this - feature. - -On Windows, semicolon-delimited lists are used instead of colon-delimited -lists. - -# Structure of SAWScript - -A SAWScript program consists, at the top level, of a sequence of -commands to be executed in order. Each command is terminated with a -semicolon. For example, the `print` command displays a textual -representation of its argument. Suppose the following text is stored in -the file `print.saw`: - -~~~~ -print 3; -~~~~ - -The command `saw print.saw` will then yield output similar to the -following: - -~~~~ -Loading module Cryptol -Loading file "print.saw" -3 -~~~~ - -The same code can be run from the interactive REPL: - -~~~~ -sawscript> print 3; -3 -~~~~ - -At the REPL, terminating semicolons can be omitted: - -~~~~ -sawscript> print 3 -3 -~~~~ - -To make common use cases simpler, bare values at the REPL are treated as -if they were arguments to `print`: - -~~~~ -sawscript> 3 -3 -~~~~ - -One SAWScript file can be included in another using the `include` -command, which takes the name of the file to be included as an argument. -For example: - -~~~~ -sawscript> include "print.saw" -Loading file "print.saw" -3 -~~~~ - -Typically, included files are used to import definitions, not perform -side effects like printing. However, as you can see, if any commands -with side effects occur at the top level of the imported file, those -side effects will occur during import. - -## Syntax - -The syntax of SAWScript is reminiscent of functional languages such as -Cryptol, Haskell and ML. In particular, functions are applied by -writing them next to their arguments rather than by using parentheses -and commas. Rather than writing `f(x, y)`, write `f x y`. - -Comments are written as in C, Java, and Rust (among many other languages). All -text from `//` until the end of a line is ignored. Additionally, all -text between `/*` and `*/` is ignored, regardless of whether the line -ends. - -## Basic Types and Values - -All values in SAWScript have types, and these types are determined and -checked before a program runs (that is, SAWScript is statically typed). -The basic types available are similar to those in many other languages. - -* The `Int` type represents unbounded mathematical integers. Integer - constants can be written in decimal notation (e.g., `42`), hexadecimal - notation (`0x2a`), and binary (`0b00101010`). However, unlike many - languages, integers in SAWScript are used primarily as constants. - Arithmetic is usually encoded in Cryptol, as discussed in the next - section. - -* The Boolean type, `Bool`, contains the values `true` and `false`, like - in many other languages. As with integers, computations on Boolean - values usually occur in Cryptol. - -* Values of any type can be aggregated into tuples. For example, the - value `(true, 10)` has the type `(Bool, Int)`. - -* Values of any type can also be aggregated into records, which are - exactly like tuples except that their components have names. For - example, the value `{ b = true, n = 10 }` has the type `{ b : Bool, - n : Int }`. - -* A sequence of values of the same type can be stored in a list. For - example, the value `[true, false, true]` has the type `[Bool]`. - -* Strings of textual characters can be represented in the `String` type. - For example, the value `"example"` has type `String`. - -* The "unit" type, written `()`, is essentially a placeholder, similar - to `void` in languages like C and Java. It has only one value, also - written `()`. Values of type `()` convey no information. We will show - in later sections several cases where this is useful. - -* Functions are given types that indicate what type they consume and - what type they produce. For example, the type `Int -> Bool` indicates - a function that takes an `Int` as input and produces a `Bool` as - output. Functions with multiple arguments use multiple arrows. For - example, the type `Int -> String -> Bool` indicates a function in - which the first argument is an `Int`, the second is a `String`, and - the result is a `Bool`. It is possible, but not necessary, to group - arguments in tuples, as well, so the type `(Int, String) -> Bool` - describes a function that takes one argument, a pair of an `Int` and a - `String`, and returns a `Bool`. - -SAWScript also includes some more specialized types that do not have -straightforward counterparts in most other languages. These will appear -in later sections. - -## Basic Expression Forms - -One of the key forms of top-level command in SAWScript is a *binding*, -introduced with the `let` keyword, which gives a name to a value. For -example: - -~~~~ -sawscript> let x = 5 -sawscript> x -5 -~~~~ - -Bindings can have parameters, in which case they define functions. For -instance, the following function takes one parameter and constructs a -list containing that parameter as its single element. - -~~~~ -sawscript> let f x = [x] -sawscript> f "text" -["text"] -~~~~ - -Functions themselves are values and have types. The type of a function -that takes an argument of type `a` and returns a result of type `b` is -`a -> b`. - -Function types are typically inferred, as in the example `f` -above. In this case, because `f` only creates a list with the given -argument, and because it is possible to create a list of any element -type, `f` can be applied to an argument of any type. We say, therefore, -that `f` is *polymorphic*. Concretely, we write the type of `f` as `{a} -a -> [a]`, meaning it takes a value of any type (denoted `a`) and -returns a list containing elements of that same type. This means we can -also apply `f` to `10`: - -~~~~ -sawscript> f 10 -[10] -~~~~ - -However, we may want to specify that a function has a more -specific type. In this case, we could restrict `f` to operate only on -`Int` parameters. - -~~~~ -sawscript> let f (x : Int) = [x] -~~~~ - -This will work identically to the original `f` on an `Int` parameter: - -~~~~ -sawscript> f 10 -[10] -~~~~ - -However, it will fail for a `String` parameter: - -~~~~ -sawscript> f "text" - -type mismatch: String -> t.0 and Int -> [Int] - at "_" (REPL) -mismatched type constructors: String and Int -~~~~ - -Type annotations can be applied to any expression. The notation `(e : -t)` indicates that expression `e` is expected to have type `t` and -that it is an error for `e` to have a different type. Most types in -SAWScript are inferred automatically, but specifying them explicitly can -sometimes enhance readability. - -Because functions are values, functions can return other functions. We -make use of this feature when writing functions of multiple arguments. -Consider the function `g`, similar to `f` but with two arguments: - -~~~~ -sawscript> let g x y = [x, y] -~~~~ - -Like `f`, `g` is polymorphic. Its type is `{a} a -> a -> [a]`. This -means it takes an argument of type `a` and returns a *function* that -takes an argument of the same type `a` and returns a list of `a` values. -We can therefore apply `g` to any two arguments of the same type: - -~~~~ -sawscript> g 2 3 -[2,3] -sawscript> g true false -[true,false] -~~~~ - -But type checking will fail if we apply it to two values of different types: - -~~~~ -sawscript> g 2 false - -type mismatch: Bool -> t.0 and Int -> [Int] - at "_" (REPL) -mismatched type constructors: Bool and Int -~~~~ - -So far we have used two related terms, *function* and *command*, and we -take these to mean slightly different things. A function is any value -with a function type (e.g., `Int -> [Int]`). A command is any value with -a special command type (e.g. `TopLevel ()`, as shown below). These -special types allow us to restrict command usage to specific contexts, -and are also *parameterized* (like the list type). Most but not all -commands are also functions. - -The most important command type is the `TopLevel` type, indicating a -command that can run at the top level (directly at the REPL, or as one -of the top level commands in a script file). The `print` command -has the type `{a} a -> TopLevel ()`, where `TopLevel ()` means that it -is a command that runs in the `TopLevel` context and returns a value of -type `()` (that is, no useful information). In other words, it has a -side effect (printing some text to the screen) but doesn't produce any -information to use in the rest of the SAWScript program. This is the -primary usage of the `()` type. - -It can sometimes be useful to bind a sequence of commands together. This -can be accomplished with the `do { ... }` construct. For example: - -~~~~ -sawscript> let print_two = do { print "first"; print "second"; } -sawscript> print_two -first -second -~~~~ - -The bound value, `print_two`, has type `TopLevel ()`, since that is the -type of its last command. - -Note that in the previous example the printing doesn't occur until -`print_two` directly appears at the REPL. The `let` expression does not -cause those commands to run. The construct that *runs* a command is -written using the `<-` operator. This operator works like `let` except -that it says to run the command listed on the right hand side and bind -the result, rather than binding the variable to the command itself. -Using `<-` instead of `let` in the previous example yields: - -~~~~ -sawscript> print_two <- do { print "first"; print "second"; } -first -second -sawscript> print print_two -() -~~~~ - -Here, the `print` commands run first, and then `print_two` gets the -value returned by the second `print` command, namely `()`. Any command -run without using `<-` at either the top level of a script or within a -`do` block discards its result. However, the REPL prints the result of -any command run without using the `<-` operator. - -In some cases it can be useful to have more control over the value -returned by a `do` block. The `return` command allows us to do this. For -example, say we wanted to write a function that would print a message -before and after running some arbitrary command and then return the -result of that command. We could write: - -~~~~ -let run_with_message msg c = - do { - print "Starting."; - print msg; - res <- c; - print "Done."; - return res; - }; - -x <- run_with_message "Hello" (return 3); -print x; -~~~~ - -If we put this script in `run.saw` and run it with `saw`, we get -something like: - -~~~~ -Loading module Cryptol -Loading file "run.saw" -Starting. -Hello -Done. -3 -~~~~ - -Note that it ran the first `print` command, then the caller-specified -command, then the second `print` command. The result stored in `x` at -the end is the result of the `return` command passed in as an argument. - -## Other Basic Functions - -Aside from the functions we have listed so far, there are a number of other -operations for working with basic data structures and interacting -with the operating system. - -The following functions work on lists: - -* `concat : {a} [a] -> [a] -> [a]` takes two lists and returns the -concatenation of the two. - -* `head : {a} [a] -> a` returns the first element of a list. - -* `tail : {a} [a] -> [a]` returns everything except the first element. - -* `length : {a} [a] -> Int` counts the number of elements in a list. - -* `null : {a} [a] -> Bool` indicates whether a list is empty (has zero -elements). - -* `nth : {a} [a] -> Int -> a` returns the element at the given position, -with `nth l 0` being equivalent to `head l`. - -* `for : {m, a, b} [a] -> (a -> m b) -> m [b]` takes a list and a -function that runs in some command context. The passed command will be -called once for every element of the list, in order. Returns a list of -all of the results produced by the command. - -For interacting with the operating system, we have: - -* `get_opt : Int -> String` returns the command-line argument to `saw` -at the given index. Argument 0 is always the name of the `saw` -executable itself, and higher indices represent later arguments. - -* `exec : String -> [String] -> String -> TopLevel String` runs an -external program given, respectively, an executable name, a list of -arguments, and a string to send to the standard input of the program. -The `exec` command returns the standard output from the program it -executes and prints standard error to the screen. - -* `exit : Int -> TopLevel ()` stops execution of the current script and -returns the given exit code to the operating system. - -Finally, there are a few miscellaneous functions and commands: - -* `show : {a} a -> String` computes the textual representation of its -argument in the same way as `print`, but instead of displaying the value -it returns it as a `String` value for later use in the program. This can -be useful for constructing more detailed messages later. - -* `str_concat : String -> String -> String` concatenates two `String` -values, and can also be useful with `show`. - -* `time : {a} TopLevel a -> TopLevel a` runs any other `TopLevel` -command and prints out the time it took to execute. - -* `with_time : {a} TopLevel a -> TopLevel (Int, a)` returns both the -original result of the timed command and the time taken to execute it -(in milliseconds), without printing anything in the process. - -## REPL Actions - -There is an additional class of things that one may type at the REPL -for interactive use: - -* `:cd` changes the REPL's current directory. - -* `:pwd` prints the REPL's current directory. - -* `:env` displays the values and types of all currently bound -variables, including built-in functions and commands. - -* `:tenv` displays the expansions of all currently defined type -aliases, including those that are built in. - -* `:type` or `:t` checks and prints the type of an arbitrary SAWScript -expression: -~~~~ -sawscript> :t show -{a.0} a.0 -> String -~~~~ - -* `:help` or `:h` prints the help text for a built-in function or -command: -~~~~ -sawscript> :h show -Description ------------ - - show : {a} a -> String - -Convert the value of the given expression to a string. -~~~~ - -* `:quit` or `:q` exits the program. - -## Further built-in functions and commands - -SAW contains many built-in operations, referred to as "primitives." -These appear in SAWScript as built-in functions and commands. -The following sections of the manual will introduce many of these. - -## Experimental and deprecated functions and commands - -Some of the primitives available in SAW at any given time are -experimental. -These may be incomplete, unfinished, use at your own risk, etc. -The functions and commands associated with these are unavailable -by default; they can be made visible with the `enable_experimental` -command. - -Other primitives are considered deprecated. -Some of these, as the -[deprecation process]("formal-deprecation-process") proceeds, are -unavailable by default. - -They can be made visible with the `enable_deprecated` command. - -# The Term Type - -Perhaps the most important type in SAWScript, and the one most unlike -the built-in types of most other languages, is the `Term` type. -Essentially, a value of type `Term` precisely describes all -possible computations performed by some program. In particular, if -two `Term` values are *equivalent*, then the programs that they -represent will always compute the same results given the same inputs. We -will say more later about exactly what it means for two terms to be -equivalent, and how to determine whether two terms are equivalent. - -Before exploring the `Term` type more deeply, it is important to -understand the role of the Cryptol language in SAW. - -# Cryptol and its Role in SAW - -Cryptol is a domain-specific language originally designed for the -high-level specification of cryptographic algorithms. It is general -enough, however, to describe a wide variety of programs, and is -particularly applicable to describing computations that operate on -streams of data of some fixed size. - -In addition to being integrated into SAW, Cryptol is a standalone -language with its own manual: - -~~~~ -http://cryptol.net/files/ProgrammingCryptol.pdf -~~~~ - -SAW includes deep support for Cryptol, and in fact requires the use of -Cryptol for most non-trivial tasks. To fully understand the rest of -this manual and to effectively use SAW, you will need to develop at least -a rudimentary understanding of Cryptol. - -The primary use of Cryptol within SAWScript is to construct values of type -`Term`. Although `Term` values can be constructed from various sources, -inline Cryptol expressions are the most direct and convenient way to create -them. - -Specifically, a Cryptol expression can be placed inside double curly -braces (`{{` and `}}`), resulting in a value of type `Term`. As a very -simple example, there is no built-in integer addition operation in -SAWScript. However, we can use Cryptol's built-in integer addition operator -within SAWScript as follows: - -~~~~ -sawscript> let t = {{ 0x22 + 0x33 }} -sawscript> print t -85 -sawscript> :type t -Term -~~~~ - -Although it printed out in the same way as an `Int`, it is important to -note that `t` actually has type `Term`. We can see how this term is -represented internally, before being evaluated, with the `print_term` -function. - -~~~~ -sawscript> print_term t -let { x@1 = Prelude.Vec 8 Prelude.Bool - x@2 = Cryptol.TCNum 8 - x@3 = Cryptol.PLiteralSeqBool x@2 - } - in Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool x@2) - (Cryptol.ecNumber (Cryptol.TCNum 34) x@1 x@3) - (Cryptol.ecNumber (Cryptol.TCNum 51) x@1 x@3) -~~~~ - -For the moment, it's not important to understand what this output means. -We show it only to clarify that `Term` values have their own internal -structure that goes beyond what exists in SAWScript. The internal -representation of `Term` values is in a language called SAWCore. The -full semantics of SAWCore are beyond the scope of this manual. - -The text constructed by `print_term` can also be accessed -programmatically (instead of printing to the screen) using the -`show_term` function, which returns a `String`. The `show_term` function -is not a command, so it executes directly and does not need `<-` to bind -its result. Therefore, the following will have the same result as the -`print_term` command above: - -~~~~ -sawscript> let s = show_term t -sawscript> :type s -String -sawscript> print s - -~~~~ - -Numbers are printed in decimal notation by default when printing terms, -but the following two commands can change that behavior. - -* `set_ascii : Bool -> TopLevel ()`, when passed `true`, makes -subsequent `print_term` or `show_term` commands print sequences of bytes -as ASCII strings (and doesn't affect printing of anything else). - -* `set_base : Int -> TopLevel ()` prints all bit vectors in the given -base, which can be between 2 and 36 (inclusive). - -A `Term` that represents an integer (any bit vector, as affected by -`set_base`) can be translated into a SAWScript `Int` using the -`eval_int : Term -> Int` function. This function returns an -`Int` if the `Term` can be represented as one, and fails at runtime -otherwise. - -~~~~ -sawscript> print (eval_int t) -85 -sawscript> print (eval_int {{ True }}) - -"eval_int" (:1:1): -eval_int: argument is not a finite bitvector -sawscript> print (eval_int {{ [True] }}) -1 -~~~~ - -Similarly, values of type `Bit` in Cryptol can be translated into values -of type `Bool` in SAWScript using the `eval_bool : Term -> Bool` function: - -~~~~ -sawscript> let b = {{ True }} -sawscript> print_term b -Prelude.True -sawscript> print (eval_bool b) -true -~~~~ - -Anything with sequence type in Cryptol can be translated into a list of -`Term` values in SAWScript using the `eval_list : Term -> [Term]` function. - -~~~~ -sawscript> let l = {{ [0x01, 0x02, 0x03] }} -sawscript> print_term l -let { x@1 = Prelude.Vec 8 Prelude.Bool - x@2 = Cryptol.PLiteralSeqBool (Cryptol.TCNum 8) - } - in [Cryptol.ecNumber (Cryptol.TCNum 1) x@1 x@2 - ,Cryptol.ecNumber (Cryptol.TCNum 2) x@1 x@2 - ,Cryptol.ecNumber (Cryptol.TCNum 3) x@1 x@2] -sawscript> print (eval_list l) -[Cryptol.ecNumber (Cryptol.TCNum 1) (Prelude.Vec 8 Prelude.Bool) - (Cryptol.PLiteralSeqBool (Cryptol.TCNum 8)) -,Cryptol.ecNumber (Cryptol.TCNum 2) (Prelude.Vec 8 Prelude.Bool) - (Cryptol.PLiteralSeqBool (Cryptol.TCNum 8)) -,Cryptol.ecNumber (Cryptol.TCNum 3) (Prelude.Vec 8 Prelude.Bool) - (Cryptol.PLiteralSeqBool (Cryptol.TCNum 8))] -~~~~ - -Finally, a list of `Term` values in SAWScript can be collapsed into a single -`Term` with sequence type using the `list_term : [Term] -> Term` function, -which is the inverse of `eval_list`. - -~~~~ -sawscript> let ts = eval_list l -sawscript> let l = list_term ts -sawscript> print_term l -let { x@1 = Prelude.Vec 8 Prelude.Bool - x@2 = Cryptol.PLiteralSeqBool (Cryptol.TCNum 8) - } - in [Cryptol.ecNumber (Cryptol.TCNum 1) x@1 x@2 - ,Cryptol.ecNumber (Cryptol.TCNum 2) x@1 x@2 - ,Cryptol.ecNumber (Cryptol.TCNum 3) x@1 x@2] -~~~~ - -In addition to being able to extract integer and Boolean values from -Cryptol expressions, `Term` values can be injected into Cryptol -expressions. When SAWScript evaluates a Cryptol expression between `{{` -and `}}` delimiters, it does so with several extra bindings in scope: - -* Any variable in scope that has SAWScript type `Bool` is visible in - Cryptol expressions as a value of type `Bit`. - -* Any variable in scope that has SAWScript type `Int` is visible in - Cryptol expressions as a *type variable*. Type variables can be - demoted to numeric bit vector values using the backtick (`` ` ``) - operator. - -* Any variable in scope that has SAWScript type `Term` is visible in - Cryptol expressions as a value with the Cryptol type corresponding to - the internal type of the term. The power of this conversion is that - the `Term` does not need to have originally been derived from a - Cryptol expression. - -In addition to these rules, bindings created at the Cryptol level, -either from included files or inside Cryptol quoting brackets, are -visible only to later Cryptol expressions, and not as SAWScript -variables. - -To make these rules more concrete, consider the following examples. If -we bind a SAWScript `Int`, we can use it as a Cryptol type variable. If -we create a `Term` variable that internally has function type, we can -apply it to an argument within a Cryptol expression, but not at the -SAWScript level: - -~~~~ -sawscript> let n = 8 -sawscript> :type n -Int -sawscript> let {{ f (x : [n]) = x + 1 }} -sawscript> :type {{ f }} -Term -sawscript> :type f - -:1:1-1:2: unbound variable: "f" (:1:1-1:2) -sawscript> print {{ f 2 }} -3 -~~~~ - -If `f` was a binding of a SAWScript variable to a `Term` of function -type, we would get a different error: - -~~~~ -sawscript> let f = {{ \(x : [n]) -> x + 1 }} -sawscript> :type {{ f }} -Term -sawscript> :type f -Term -sawscript> print {{ f 2 }} -3 -sawscript> print (f 2) - -type mismatch: Int -> t.0 and Term - at "_" (REPL) - mismatched type constructors: (->) and Term -~~~~ - -One subtlety of dealing with `Term`s constructed from Cryptol is that -because the Cryptol expressions themselves are type checked by the -Cryptol type checker, and because they may make use of other `Term` -values already in scope, they are not type checked until the Cryptol -brackets are evaluated. So type errors at the Cryptol level may occur at -runtime from the SAWScript perspective (though they occur before the -Cryptol expressions are run). - -So far, we have talked about using Cryptol *value* expressions. However, -SAWScript can also work with Cryptol *types*. The most direct way to -refer to a Cryptol type is to use type brackets: `{|` and `|}`. Any -Cryptol type written between these brackets becomes a `Type` value in -SAWScript. Some types in Cryptol are *numeric* (also known as *size*) -types, and correspond to non-negative integers. These can be translated -into SAWScript integers with the `eval_size` function. For example: - -~~~~ -sawscript> let {{ type n = 16 }} -sawscript> eval_size {| n |} -16 -sawscript> eval_size {| 16 |} -16 -~~~~ - -For non-numeric types, `eval_size` fails at runtime: - -~~~~ -sawscript> eval_size {| [16] |} - -"eval_size" (:1:1): -eval_size: not a numeric type -~~~~ - -In addition to the use of brackets to write Cryptol expressions inline, -several built-in functions can extract `Term` values from Cryptol files -in other ways. The `import` command at the top level imports all -top-level definitions from a Cryptol file and places them in scope -within later bracketed expressions. This includes [Cryptol `foreign` -declarations](https://galoisinc.github.io/cryptol/master/FFI.html). If a -[Cryptol implementation of a foreign -function](https://galoisinc.github.io/cryptol/master/FFI.html#cryptol-implementation-of-foreign-functions) -is present, then it will be used as the definition when reasoning about -the function. Otherwise, the function will be imported as an opaque -constant with no definition. - -The `cryptol_load` command behaves similarly, but returns a -`CryptolModule` instead. If any `CryptolModule` is in scope, its -contents are available qualified with the name of the `CryptolModule` -variable. A specific definition can be explicitly extracted from a -`CryptolModule` using the `cryptol_extract` command: - -* `cryptol_extract : CryptolModule -> String -> TopLevel Term` - -# Transforming Term Values - -The three primary functions of SAW are *extracting* models (`Term` -values) from programs, *transforming* those models, and *proving* -properties about models using external provers. So far we've shown how -to construct `Term` values from Cryptol programs; later sections -will describe how to extract them from other programs. Now we show how -to use the various term transformation features available in SAW. - -## Rewriting - -Rewriting a `Term` consists of applying one or more *rewrite rules* to -it, resulting in a new `Term`. A rewrite rule in SAW can be specified in -multiple ways: - - 1. as the definition of a function that can be unfolded, - 1. as a term of Boolean type (or a function returning a Boolean) that - is an equality statement, and - 1. as a term of _equality type_ with a body that encodes a proof that - the equality in the type is valid. - -In each case the term logically consists of two sides and describes a -way to transform the left side into the right side. Each side may -contain variables (bound by enclosing lambda expressions) and is -therefore a *pattern* which can match any term in which each variable -represents an arbitrary sub-term. The left-hand pattern describes a term -to match (which may be a sub-term of the full term being rewritten), and -the right-hand pattern describes a term to replace it with. Any variable -in the right-hand pattern must also appear in the left-hand pattern and -will be instantiated with whatever sub-term matched that variable in the -original term. - -For example, say we have the following Cryptol function: - -~~~~ - \(x:[8]) -> (x * 2) + 1 -~~~~ - -We might for some reason want to replace multiplication by a power of -two with a shift. We can describe this replacement using an equality -statement in Cryptol (a rule of form 2 above): - -~~~~ -\(y:[8]) -> (y * 2) == (y << 1) -~~~~ - -Interpreting this as a rewrite rule, it says that for any 8-bit vector -(call it `y` for now), we can replace `y * 2` with `y << 1`. Using this -rule to rewrite the earlier expression would then yield: - -~~~~ - \(x:[8]) -> (x << 1) + 1 -~~~~ - -The general philosophy of rewriting is that the left and right patterns, -while syntactically different, should be semantically equivalent. -Therefore, applying a set of rewrite rules should not change the -fundamental meaning of the term being rewritten. SAW is particularly -focused on the task of proving that some logical statement expressed as -a `Term` is always true. If that is in fact the case, then the entire -term can be replaced by the term `True` without changing its meaning. The -rewriting process can in some cases, by repeatedly applying rules that -themselves are known to be valid, reduce a complex term entirely to -`True`, which constitutes a proof of the original statement. In other -cases, rewriting can simplify terms before sending them to external -automated provers that can then finish the job. Sometimes this -simplification can help the automated provers run more quickly, and -sometimes it can help them prove things they would otherwise be unable -to prove by applying reasoning steps (rewrite rules) that are not -available to the automated provers. - -In practical use, rewrite rules can be aggregated into `Simpset` -values in SAWScript. A few pre-defined `Simpset` values exist: - -* `empty_ss : Simpset` is the empty set of rules. Rewriting with it -should have no effect, but it is useful as an argument to some of the -functions that construct larger `Simpset` values. - -* `basic_ss : Simpset` is a collection of rules that are useful in most -proof scripts. - -* `cryptol_ss : () -> Simpset` includes a collection of Cryptol-specific -rules. Some of these simplify away the abstractions introduced in the -translation from Cryptol to SAWCore, which can be useful when proving -equivalence between Cryptol and non-Cryptol code. Leaving these -abstractions in place is appropriate when comparing only Cryptol code, -however, so `cryptol_ss` is not included in `basic_ss`. - -The next set of functions can extend or apply a `Simpset`: - -* `addsimp' : Term -> Simpset -> Simpset` adds a single `Term` to an -existing `Simpset. - -* `addsimps' : [Term] -> Simpset -> Simpset` adds a list of `Term`s to -an existing `Simpset`. - -* `rewrite : Simpset -> Term -> Term` applies a `Simpset` to an existing -`Term` to produce a new `Term`. - -To make this more concrete, we examine how the rewriting example -sketched above, to convert multiplication into shift, can work in -practice. We simplify everything with `cryptol_ss` as we go along so -that the `Term`s don't get too cluttered. First, we declare the term -to be transformed: - -~~~~ -sawscript> let term = rewrite (cryptol_ss ()) {{ \(x:[8]) -> (x * 2) + 1 }} -sawscript> print_term term -\(x : Prelude.Vec 8 Prelude.Bool) -> - Prelude.bvAdd 8 (Prelude.bvMul 8 x (Prelude.bvNat 8 2)) - (Prelude.bvNat 8 1) -~~~~ - -Next, we declare the rewrite rule: - -~~~~ -sawscript> let rule = rewrite (cryptol_ss ()) {{ \(y:[8]) -> (y * 2) == (y << 1) }} -sawscript> print_term rule -let { x@1 = Prelude.Vec 8 Prelude.Bool - } - in \(y : x@1) -> - Cryptol.ecEq x@1 (Cryptol.PCmpWord 8) - (Prelude.bvMul 8 y (Prelude.bvNat 8 2)) - (Prelude.bvShiftL 8 Prelude.Bool 1 Prelude.False y - (Prelude.bvNat 1 1)) -~~~~ - -Finally, we apply the rule to the target term: - -~~~~ -sawscript> let result = rewrite (addsimp' rule empty_ss) term -sawscript> print_term result -\(x : Prelude.Vec 8 Prelude.Bool) -> - Prelude.bvAdd 8 - (Prelude.bvShiftL 8 Prelude.Bool 1 Prelude.False x - (Prelude.bvNat 1 1)) - (Prelude.bvNat 8 1) -~~~~ - -Note that `addsimp'` and `addsimps'` take a `Term` or list of `Term`s; -these could in principle be anything, and are not necessarily terms -representing logically valid equalities. They have `'` suffixes because -they are not intended to be the primary interface to rewriting. When using -these functions, the soundness of the proof process depends on the -correctness of these rules as a side condition. - -The primary interface to rewriting uses the `Theorem` type instead of -the `Term` type, as shown in the signatures for `addsimp` and -`addsimps`. - -* `addsimp : Theorem -> Simpset -> Simpset` adds a single `Theorem` to a -`Simpset`. - -* `addsimps : [Theorem] -> Simpset -> Simpset` adds several `Theorem` -values to a `Simpset`. - -A `Theorem` is essentially a `Term` that is proven correct in some way. -In general, a `Theorem` can be any statement, and may not be useful as a -rewrite rule. However, if it has an appropriate shape it can be used for -rewriting. In the ["Proofs about Terms"](#proofs-about-terms) section, -we'll describe how to construct `Theorem` values from `Term` values. - -In the absence of user-constructed `Theorem` values, there are some -additional built-in rules that are not included in either `basic_ss` and -`cryptol_ss` because they are not always beneficial, but that can -sometimes be helpful or essential. The `cryptol_ss` simpset includes -rewrite rules to unfold all definitions in the `Cryptol` SAWCore module, -but does not include any of the terms of equality type. - -* `add_cryptol_defs : `[String] -> Simpset -> Simpset` adds unfolding -rules for functions with the given names from the SAWCore `Cryptol` module -to the given `Simpset`. - -* `add_cryptol_eqs : [String] -> Simpset -> Simpset` adds the terms of -equality type with the given names from the SAWCore `Cryptol` module to -the given `Simpset`. - -* `add_prelude_defs : [String] -> Simpset -> Simpset` adds unfolding -rules from the SAWCore `Prelude` module to a `Simpset`. - -* `add_prelude_eqs : [String] -> Simpset -> Simpset` adds equality-typed -terms from the SAWCore `Prelude` module to a `Simpset`. - -Finally, it's possible to construct a theorem from an arbitrary SAWCore -expression (rather than a Cryptol expression), using the `core_axiom` -function. - -* `core_axiom : String -> Theorem` creates a `Theorem` from a `String` -in SAWCore syntax. Any `Theorem` introduced by this function is assumed -to be correct, so use it with caution. - -## Folding and Unfolding - -A SAWCore term can be given a name using the `define` function, and is -then by default printed as that name alone. A named subterm can be -"unfolded" so that the original definition appears again. - -* `define : String -> Term -> TopLevel Term` - -* `unfold_term : [String] -> Term -> Term` - -For example: - -~~~~ -sawscript> let t = {{ 0x22 }} -sawscript> print_term t -Cryptol.ecNumber (Cryptol.TCNum 34) (Prelude.Vec 8 Prelude.Bool) - (Cryptol.PLiteralSeqBool (Cryptol.TCNum 8)) -sawscript> t' <- define "t" t -sawscript> print_term t' -t -sawscript> let t'' = unfold_term ["t"] t' -sawscript> print_term t'' -Cryptol.ecNumber (Cryptol.TCNum 34) (Prelude.Vec 8 Prelude.Bool) - (Cryptol.PLiteralSeqBool (Cryptol.TCNum 8)) -~~~~ - -This process of folding and unfolding is useful both to make large terms -easier for humans to work with and to make automated proofs more -tractable. We'll describe the latter in more detail when we discuss -interacting with external provers. - -In some cases, folding happens automatically when constructing Cryptol -expressions. Consider the following example: - -~~~~ -sawscript> let t = {{ 0x22 }} -sawscript> print_term t -Cryptol.ecNumber (Cryptol.TCNum 34) (Prelude.Vec 8 Prelude.Bool) - (Cryptol.PLiteralSeqBool (Cryptol.TCNum 8)) -sawscript> let {{ t' = 0x22 }} -sawscript> print_term {{ t' }} -t' -~~~~ - -This illustrates that a bare expression in Cryptol braces gets -translated directly to a SAWCore term. However, a Cryptol *definition* -gets translated into a *folded* SAWCore term. In addition, because the -second definition of `t` occurs at the Cryptol level, rather than the -SAWScript level, it is visible only inside Cryptol braces. Definitions -imported from Cryptol source files are also initially folded and can be -unfolded as needed. - -## Other Built-in Transformation and Inspection Functions - -In addition to the `Term` transformation functions described so far, a -variety of others also exist. - -* `beta_reduce_term : Term -> Term` takes any sub-expression of the form -`(\x -> t) v` in the given `Term` and replaces it with a transformed -version of `t` in which all instances of `x` are replaced by `v`. - -* `replace : Term -> Term -> Term -> TopLevel Term` replaces arbitrary -subterms. A call to `replace x y t` replaces any instance of `x` inside -`t` with `y`. - -Assessing the size of a term can be particularly useful during benchmarking. -SAWScript provides two mechanisms for this. - -* `term_size : Term -> Int` calculates the number of nodes in the -Directed Acyclic Graph (DAG) representation of a `Term` used internally -by SAW. This is the most appropriate way of determining the resource use -of a particular term. - -* `term_tree_size : Term -> Int` calculates how large a `Term` would be -if it were represented by a tree instead of a DAG. This can, in general, -be much, much larger than the number returned by `term_size`, and serves -primarily as a way of assessing, for a specific term, how much benefit -there is to the term sharing used by the DAG representation. - -Finally, there are a few commands related to the internal SAWCore type of a -`Term`. - -* `check_term : Term -> TopLevel ()` checks that the internal structure -of a `Term` is well-formed and that it passes all of the rules of the -SAWCore type checker. - -* `type : Term -> Type` returns the type of a particular `Term`, which -can then be used to, for example, construct a new fresh variable with -`fresh_symbolic`. - -## Loading and Storing Terms - -Most frequently, `Term` values in SAWScript come from Cryptol, JVM, or -LLVM programs, or some transformation thereof. However, it is also -possible to obtain them from various other sources. - -* `parse_core : String -> Term` parses a `String` containing a term in -SAWCore syntax, returning a `Term`. - -* `read_core : String -> TopLevel Term` is like `parse_core`, but -obtains the text from the given file and expects it to be in the simpler -SAWCore external representation format, rather than the human-readable -syntax shown so far. - -* `read_aig : String -> TopLevel Term` returns a `Term` representation -of an And-Inverter-Graph (AIG) file in AIGER format. - -* `read_bytes : String -> TopLevel Term` reads a constant sequence of -bytes from a file and represents it as a `Term`. Its result will always -have Cryptol type `[n][8]` for some `n`. - -It is also possible to write `Term` values into files in various -formats, including: AIGER (`write_aig`), CNF (`write_cnf`), SAWCore -external representation (`write_core`), and SMT-Lib version 2 -(`write_smtlib2`). - -* `write_aig : String -> Term -> TopLevel ()` - -* `write_cnf : String -> Term -> TopLevel ()` - -* `write_core : String -> Term -> TopLevel ()` - -* `write_smtlib2 : String -> Term -> TopLevel ()` - -# Proofs about Terms - -The goal of SAW is to facilitate proofs about the behavior of -programs. It may be useful to prove some small fact to use as a rewrite -rule in later proofs, but ultimately these rewrite rules come together -into a proof of some higher-level property about a software system. - -Whether proving small lemmas (in the form of rewrite rules) or a -top-level theorem, the process builds on the idea of a *proof script* -that is run by one of the top level proof commands. - -* `prove_print : ProofScript () -> Term -> TopLevel Theorem` -takes a proof script (which we'll describe next) and a `Term`. The -`Term` should be of function type with a return value of `Bool` (`Bit` -at the Cryptol level). It will then use the proof script to attempt to -show that the `Term` returns `True` for all possible inputs. If it is -successful, it will print `Valid` and return a `Theorem`. If not, it -will abort. - -* `sat_print : ProofScript () -> Term -> TopLevel ()` is similar -except that it looks for a *single* value for which the `Term` evaluates -to `True` and prints out that value, returning nothing. - -* `prove_core : ProofScript () -> String -> TopLevel Theorem` -proves and returns a `Theorem` from a string in SAWCore syntax. - -## Automated Tactics - -The simplest proof scripts just specify the automated prover to use. -The `ProofScript` values `abc` and `z3` select the ABC and Z3 theorem -provers, respectively, and are typically good choices. - -For example, combining `prove_print` with `abc`: - -~~~~ -sawscript> t <- prove_print abc {{ \(x:[8]) -> x+x == x*2 }} -Valid -sawscript> t -Theorem (let { x@1 = Prelude.Vec 8 Prelude.Bool - x@2 = Cryptol.TCNum 8 - x@3 = Cryptol.PArithSeqBool x@2 - } - in (x : x@1) --> Prelude.EqTrue - (Cryptol.ecEq x@1 (Cryptol.PCmpSeqBool x@2) - (Cryptol.ecPlus x@1 x@3 x x) - (Cryptol.ecMul x@1 x@3 x - (Cryptol.ecNumber (Cryptol.TCNum 2) x@1 - (Cryptol.PLiteralSeqBool x@2))))) -~~~~ - -Similarly, `sat_print` will show that the function returns `True` for -one specific input (which it should, since we already know it returns -`True` for all inputs): - -~~~~ -sawscript> sat_print abc {{ \(x:[8]) -> x+x == x*2 }} -Sat: [x = 0] -~~~~ - -In addition to these, the `bitwuzla`, `boolector`, `cvc4`, `cvc5`, `mathsat`, -and `yices` provers are available. The internal decision procedure `rme`, short -for Reed-Muller Expansion, is an automated prover that works particularly well -on the Galois field operations that show up, for example, in AES. - -In more complex cases, some pre-processing can be helpful or necessary -before handing the problem off to an automated prover. The -pre-processing can involve rewriting, beta reduction, unfolding, the use -of provers that require slightly more configuration, or the use of -provers that do very little real work. - -## Proof Script Diagnostics - -During development of a proof, it can be useful to print various -information about the current goal. The following tactics are useful in -that context. - -* `print_goal : ProofScript ()` prints the entire goal in SAWCore -syntax. - -* `print_goal_consts : ProofScript ()` prints a list of unfoldable constants -in the current goal. - -* `print_goal_depth : Int -> ProofScript ()` takes an integer argument, `n`, -and prints the goal up to depth `n`. Any elided subterms are printed -with a `...` notation. - -* `print_goal_size : ProofScript ()` prints the number of nodes in the -DAG representation of the goal. - -## Rewriting in Proof Scripts - -One of the key techniques available for completing proofs in SAWScript -is the use of rewriting or transformation. The following commands -support this approach. - -* `simplify : Simpset -> ProofScript ()` works just like `rewrite`, -except that it works in a `ProofScript` context and implicitly -transforms the current (unnamed) goal rather than taking a `Term` as a -parameter. - -* `goal_eval : ProofScript ()` will evaluate the current proof goal to a -first-order combination of primitives. - -* `goal_eval_unint : [String] -> ProofScript ()` works like `goal_eval` -but avoids expanding or simplifying the given names. - -## Other Transformations - -Some useful transformations are not easily specified using equality -statements, and instead have special tactics. - -* `beta_reduce_goal : ProofScript ()` works like `beta_reduce_term` but -on the current goal. It takes any sub-expression of the form `(\x -> t) -v` and replaces it with a transformed version of `t` in which all -instances of `x` are replaced by `v`. - -* `unfolding : [String] -> ProofScript ()` works like `unfold_term` but -on the current goal. - -Using `unfolding` is mostly valuable for proofs -based entirely on rewriting, since the default behavior for automated -provers is to unfold everything before sending a goal to a prover. -However, with some provers it is possible to indicate that specific -named subterms should be represented as uninterpreted functions. - -* `unint_bitwuzla : [String] -> ProofScript ()` - -* `unint_cvc4 : [String] -> ProofScript ()` - -* `unint_cvc5 : [String] -> ProofScript ()` - -* `unint_yices : [String] -> ProofScript ()` - -* `unint_z3 : [String] -> ProofScript ()` - -The list of `String` arguments in these cases indicates the names of the -subterms to leave folded, and therefore present as uninterpreted -functions to the prover. To determine which folded constants appear in a -goal, use the `print_goal_consts` function described above. - -Ultimately, we plan to implement a more generic tactic that leaves -certain constants uninterpreted in whatever prover is ultimately used -(provided that uninterpreted functions are expressible in the prover). - -Note that each of the `unint_*` tactics have variants that are prefixed -with `sbv_` and `w4_`. The `sbv_`-prefixed tactics make use of the SBV -library to represent and solve SMT queries: - -* `sbv_unint_bitwuzla : [String] -> ProofScript ()` - -* `sbv_unint_cvc4 : [String] -> ProofScript ()` - -* `sbv_unint_cvc5 : [String] -> ProofScript ()` - -* `sbv_unint_yices : [String] -> ProofScript ()` - -* `sbv_unint_z3 : [String] -> ProofScript ()` - -The `w4_`-prefixed tactics make use of the What4 library instead of SBV: - -* `w4_unint_bitwuzla : [String] -> ProofScript ()` - -* `w4_unint_cvc4 : [String] -> ProofScript ()` - -* `w4_unint_cvc5 : [String] -> ProofScript ()` - -* `w4_unint_yices : [String] -> ProofScript ()` - -* `w4_unint_z3 : [String] -> ProofScript ()` - -In most specifications, the choice of SBV versus What4 is not important, as -both libraries are broadly compatible in terms of functionality. There are some -situations where one library may outpeform the other, however, due to -differences in how each library represents certain SMT queries. There are also -some experimental features that are only supported with What4 at the moment, -such as `enable_lax_loads_and_stores`. - -## Caching Solver Results - -SAW has the capability to cache the results of tactics which call out to -automated provers. This can save a considerable amount of time in cases such as -proof development and CI, where the same proof scripts are often run repeatedly -without changes. - -This caching is available for all tactics which call out to automated provers -at runtime: `abc`, `boolector`, `cvc4`, `cvc5`, `mathsat`, `yices`, `z3`, -`rme`, and the family of `unint` tactics described in the previous section. - -When solver caching is enabled and one of the tactics mentioned above is -encountered, if there is already an entry in the cache corresponding to the -call then the cached result is used, otherwise the appropriate solver is -queried, and the result saved to the cache. Entries are indexed by a SHA256 -hash of the exact query to the solver (ignoring variable names), any options -passed to the solver, and the names and full version strings of all the solver -backends involved (e.g. ABC and SBV for the `abc` tactic). This ensures cached -results are only used when they would be identical to the result of actually -running the tactic. - -The simplest way to enable solver caching is to set the environment variable -`SAW_SOLVER_CACHE_PATH`. With this environment variable set, `saw` and -`saw-remote-api` will automatically keep an [LMDB](http://www.lmdb.tech/doc/) -database at the given path containing the solver result cache. Setting this -environment variable globally therefore creates a global, concurrency-safe -solver result cache used by all newly created `saw` or `saw-remote-api` -processes. Note that when this environment variable is set, SAW does not create -a cache at the specified path until it is actually needed. - -There are also a number of SAW commands related to solver caching. - -* `set_solver_cache_path` is like setting `SAW_SOLVER_CACHE_PATH` for the - remainder of the current session, but opens an LMDB database at the specified - path immediately. If a cache is already in use in the current session - (i.e. through a prior call to `set_solver_cache_path` or through - `SAW_SOLVER_CACHE_PATH` being set and the cache being used at least once) - then all entries in the cache already in use will be copied to the new cache - being opened. - -* `clean_mismatched_versions_solver_cache` will remove all entries in the - solver result cache which were created using solver backend versions which do - not match the versions in the current environment. This can be run after an - update to clear out any old, unusable entries from the solver cache. This - command can also be run directly from the command line through the - `--clean-mismatched-versions-solver-cache` command-line option. - -* `print_solver_cache` prints to the console all entries in the cache whose - SHA256 hash keys start with the given hex string. Providing an empty string - results in all entries in the cache being printed. - -* `print_solver_cache_stats` prints to the console statistics including the - size of the solver cache, where on disk it is stored, and some counts of how - often it has been used during the current session. - -For performing more complicated database operations on the set of cached -results, the file `solver_cache.py` is provided with the Python bindings of the -SAW Remote API. This file implements a general-purpose Python interface for -interacting with the LMDB databases kept by SAW for solver caching. - -Below is an example of using solver caching with `saw -v Debug`. Only the -relevant output is shown, the rest abbreviated with "...". - -~~~~ -sawscript> set_solver_cache_path "example.cache" -sawscript> prove_print z3 {{ \(x:[8]) -> x+x == x*2 }} -[22:13:00.832] Caching result: d1f5a76e7a0b7c01 (SBV 9.2, Z3 4.8.7 - 64 bit) -... -sawscript> prove_print z3 {{ \(new:[8]) -> new+new == new*2 }} -[22:13:04.122] Using cached result: d1f5a76e7a0b7c01 (SBV 9.2, Z3 4.8.7 - 64 bit) -... -sawscript> prove_print (w4_unint_z3_using "qfnia" []) \ - {{ \(x:[8]) -> x+x == x*2 }} -[22:13:09.484] Caching result: 4ee451f8429c2dfe (What4 v1.3-29-g6c462cd using qfnia, Z3 4.8.7 - 64 bit) -... -sawscript> print_solver_cache "d1f5a76e7a0b7c01" -[22:13:13.250] SHA: d1f5a76e7a0b7c01bdfe7d0e1be82b4f233a805ae85a287d45933ed12a54d3eb -[22:13:13.250] - Result: unsat -[22:13:13.250] - Solver: "SBV->Z3" -[22:13:13.250] - Versions: SBV 9.2, Z3 4.8.7 - 64 bit -[22:13:13.250] - Last used: 2023-07-25 22:13:04.120351 UTC - -sawscript> print_solver_cache "4ee451f8429c2dfe" -[22:13:16.727] SHA: 4ee451f8429c2dfefecb6216162bd33cf053f8e66a3b41833193529449ef5752 -[22:13:16.727] - Result: unsat -[22:13:16.727] - Solver: "W4 ->z3" -[22:13:16.727] - Versions: What4 v1.3-29-g6c462cd using qfnia, Z3 4.8.7 - 64 bit -[22:13:16.727] - Last used: 2023-07-25 22:13:09.484464 UTC - -sawscript> print_solver_cache_stats -[22:13:20.585] == Solver result cache statistics == -[22:13:20.585] - 2 results cached in example.cache -[22:13:20.585] - 2 insertions into the cache so far this run (0 failed attempts) -[22:13:20.585] - 1 usage of cached results so far this run (0 failed attempts) -~~~~ - -## Other External Provers - -In addition to the built-in automated provers already discussed, SAW -supports more generic interfaces to other arbitrary theorem provers -supporting specific interfaces. - -* `external_aig_solver : String -> [String] -> ProofScript ()` -supports theorem provers that can take input as a single-output AIGER -file. The first argument is the name of the executable to run. The -second argument is the list of command-line parameters to pass to that -executable. Any element of this list equal to `"%f"` will be replaced -with the name of the temporary AIGER file generated for the proof goal. -The output from the solver is expected to be in DIMACS solution format. - -* `external_cnf_solver : String -> [String] -> ProofScript ()` -works similarly but for SAT solvers that take input in DIMACS CNF format -and produce output in DIMACS solution format. - -## Offline Provers - -For provers that must be invoked in more complex ways, or to defer proof -until a later time, there are functions to write the current goal to a -file in various formats, and then assume that the goal is valid through -the rest of the script. - -* `offline_aig : String -> ProofScript ()` - -* `offline_cnf : String -> ProofScript ()` - -* `offline_extcore : String -> ProofScript ()` - -* `offline_smtlib2 : String -> ProofScript ()` - -* `offline_unint_smtlib2 : [String] -> String -> ProofScript ()` - -These support the AIGER, DIMACS CNF, shared SAWCore, and SMT-Lib v2 -formats, respectively. The shared representation for SAWCore is -described [in the `saw-script` -repository](https://github.com/GaloisInc/saw-script/blob/master/doc/extcore.md). -The `offline_unint_smtlib2` command represents the folded subterms -listed in its first argument as uninterpreted functions. - -## Finishing Proofs without External Solvers - -Some proofs can be completed using unsound placeholders, or using -techniques that do not require significant computation. - -* `assume_unsat : ProofScript ()` indicates that the current goal -should be assumed to be unsatisfiable. This is an alias for -`assume_valid`. Users should prefer to use `admit` instead. - -* `assume_valid : ProofScript ()` indicates that the current -goal should be assumed to be valid. Users should prefer to -use `admit` instead - -* `admit : String -> ProofScript ()` indicates that the current -goal should be assumed to be valid without proof. The given -string should be used to record why the user has decided to -assume this proof goal. - -* `quickcheck : Int -> ProofScript ()` runs the goal on the given -number of random inputs, and succeeds if the result of evaluation is -always `True`. This is unsound, but can be helpful during proof -development, or as a way to provide some evidence for the validity of a -specification believed to be true but difficult or infeasible to prove. - -* `trivial : ProofScript ()` states that the current goal should -be trivially true. This tactic recognizes instances of equality -that can be demonstrated by conversion alone. In particular -it is able to prove `EqTrue x` goals where `x` reduces to -the constant value `True`. It fails if this is not the case. - -## Multiple Goals - -The proof scripts shown so far all have a single implicit goal. As in -many other interactive provers, however, SAWScript proofs can have -multiple goals. The following commands can introduce or work with -multiple goals. These are experimental and can be used only after -`enable_experimental` has been called. - -* `goal_apply : Theorem -> ProofScript ()` will apply a given -introduction rule to the current goal. This will result in zero or more -new subgoals. - -* `goal_assume : ProofScript Theorem` will convert the first hypothesis -in the current proof goal into a local `Theorem` - -* `goal_insert : Theorem -> ProofScript ()` will insert a given -`Theorem` as a new hypothesis in the current proof goal. - -* `goal_intro : String -> ProofScript Term` will introduce a quantified -variable in the current proof goal, returning the variable as a `Term`. - -* `goal_when : String -> ProofScript () -> ProofScript ()` will run the -given proof script only when the goal name contains the given string. - -* `goal_exact : Term -> ProofScript ()` will attempt to use the given -term as an exact proof for the current goal. This tactic will succeed -whever the type of the given term exactly matches the current goal, -and will fail otherwise. - -* `split_goal : ProofScript ()` will split a goal of the form -`Prelude.and prop1 prop2` into two separate goals `prop1` and `prop2`. - -## Proof Failure and Satisfying Assignments - -The `prove_print` and `sat_print` commands print out their essential -results (potentially returning a `Theorem` in the case of -`prove_print`). In some cases, though, one may want to act -programmatically on the result of a proof rather than displaying it. - -The `prove` and `sat` commands allow this sort of programmatic analysis -of proof results. To allow this, they use two types we haven't mentioned -yet: `ProofResult` and `SatResult`. These are different from the other -types in SAWScript because they encode the possibility of two outcomes. -In the case of `ProofResult`, a statement may be valid or there may be a -counter-example. In the case of `SatResult`, there may be a satisfying -assignment or the statement may be unsatisfiable. - -* `prove : ProofScript SatResult -> Term -> TopLevel ProofResult` - -* `sat : ProofScript SatResult -> Term -> TopLevel SatResult` - -To operate on these new types, SAWScript includes a pair of functions: - -* `caseProofResult : {b} ProofResult -> b -> (Term -> b) -> b` takes a -`ProofResult`, a value to return in the case that the statement is -valid, and a function to run on the counter-example, if there is one. - -* `caseSatResult : {b} SatResult -> b -> (Term -> b) -> b` has the same -shape: it returns its first argument if the result represents an -unsatisfiable statement, or its second argument applied to a satisfying -assignment if it finds one. - -## AIG Values and Proofs - -Most SAWScript programs operate on `Term` values, and in most cases this -is the appropriate representation. It is possible, however, to represent -the same function that a `Term` may represent using a different data -structure: an And-Inverter-Graph (AIG). An AIG is a representation of a -Boolean function as a circuit composed entirely of AND gates and -inverters. Hardware synthesis and verification tools, including the ABC -tool that SAW has built in, can do efficient verification and -particularly equivalence checking on AIGs. - -To take advantage of this capability, a handful of built-in commands can -operate on AIGs. - -* `bitblast : Term -> TopLevel AIG` represents a `Term` as an `AIG` by -"blasting" all of its primitive operations (things like bit-vector -addition) down to the level of individual bits. - -* `load_aig : String -> TopLevel AIG` loads an `AIG` from an external -AIGER file. - -* `save_aig : String -> AIG -> TopLevel ()` saves an `AIG` to an -external AIGER file. - -* `save_aig_as_cnf : String -> AIG -> TopLevel ()` writes an `AIG` out -in CNF format for input into a standard SAT solver. - -# Symbolic Execution - -Analysis of Java and LLVM within SAWScript relies heavily on *symbolic -execution*, so some background on how this process works can help with -understanding the behavior of the available built-in functions. - -At the most abstract level, symbolic execution works like normal program -execution except that the values of all variables within the program can be -arbitrary *expressions*, potentially containing free variables, rather than -concrete values. Therefore, each symbolic execution corresponds to some set -of possible concrete executions. - -As a concrete example, consider the following C program that returns -the maximum of two values: - -~~~~ {.c} -unsigned int max(unsigned int x, unsigned int y) { - if (y > x) { - return y; - } else { - return x; - } -} -~~~~ - -If you call this function with two concrete inputs, like this: - -~~~~ {.c} -int r = max(5, 4); -~~~~ - -then it will assign the value `5` to `r`. However, we can also consider what -it will do for *arbitrary* inputs. Consider the following example: - -~~~~ {.c} -int r = max(a, b); -~~~~ - -where `a` and `b` are variables with unknown values. It is still -possible to describe the result of the `max` function in terms of `a` -and `b`. The following expression describes the value of `r`: - -~~~~ -ite (b > a) b a -~~~~ - -where `ite` is the "if-then-else" mathematical function, which based on -the value of the first argument returns either the second or third. One -subtlety of constructing this expression, however, is the treatment of -conditionals in the original program. For any concrete values of `a` and -`b`, only one branch of the `if` statement will execute. During symbolic -execution, on the other hand, it is necessary to execute *both* -branches, track two different program states (each composed of symbolic -values), and then *merge* those states after executing the `if` -statement. This merging process takes into account the original branch -condition and introduces the `ite` expression. - -A symbolic execution system, then, is very similar to an interpreter -that has a different notion of what constitutes a value and executes -*all* paths through the program instead of just one. Therefore, the -execution process is similar to that of a normal -interpreter, and the process of generating a model for a piece of code -is similar to building a test harness for that same code. - -More specifically, the setup process for a test harness typically takes -the following form: - -* Initialize or allocate any resources needed by the code. For Java and - LLVM code, this typically means allocating memory and setting the - initial values of variables. - -* Execute the code. - -* Check the desired properties of the system state after the code - completes. - -Accordingly, three pieces of information are particularly relevant to -the symbolic execution process, and are therefore needed as input to the -symbolic execution system: - -* The initial (potentially symbolic) state of the system. - -* The code to execute. - -* The final state of the system, and which parts of it are relevant to - the properties being tested. - -In the following sections, we describe how the Java and LLVM analysis -primitives work in the context of these key concepts. We start with the -simplest situation, in which the structure of the initial and final -states can be directly inferred, and move on to more complex cases that -require more information from the user. - -# Symbolic Termination - -Above we described the process of executing multiple branches and -merging the results when encountering a conditional statement in the -program. When a program contains loops, the branch that chooses to -continue or terminate a loop could go either way. Therefore, without a -bit more information, the most obvious implementation of symbolic -execution would never terminate when executing programs that contain -loops. - -The solution to this problem is to analyze the branch condition whenever -considering multiple branches. If the condition for one branch can never -be true in the context of the current symbolic state, there is no reason -to execute that branch, and skipping it can make it possible for -symbolic execution to terminate. - -Directly comparing the branch condition to a constant can sometimes be -enough to ensure termination. For example, in simple, bounded loops like -the following, comparison with a constant is sufficient. - -~~~~ {.c} -for (int i = 0; i < 10; i++) { - // do something -} -~~~~ - -In this case, the value of `i` is always concrete, and will eventually -reach the value `10`, at which point the branch corresponding to -continuing the loop will be infeasible. - -As a more complex example, consider the following function: - -~~~~ {.c} -uint8_t f(uint8_t i) { - int done = 0; - while (!done) { - if (i % 8 == 0) done = 1; - i += 5; - } - return i; -} -~~~~ - -The loop in this function can only be determined to symbolically -terminate if the analysis takes into account algebraic rules about -common multiples. Similarly, it can be difficult to prove that a base -case is eventually reached for all inputs to a recursive program. - -In this particular case, however, the code *is* guaranteed to terminate -after a fixed number of iterations (where the number of possible -iterations is a function of the number of bits in the integers being -used). To show that the last iteration is in fact the last possible one, -it's necessary to do more than just compare the branch condition with a -constant. Instead, we can use the same proof tools that we use to -ultimately analyze the generated models to, early in the process, prove -that certain branch conditions can never be true (i.e., are -*unsatisfiable*). - -Normally, most of the Java and LLVM analysis commands simply compare -branch conditions to the constant `True` or `False` to determine whether -a branch may be feasible. However, each form of analysis allows branch -satisfiability checking to be turned on if needed, in which case -functions like `f` above will terminate. - -Next, we examine the details of the specific commands available to -analyze JVM and LLVM programs. - -# Loading Code - -The first step in analyzing any code is to load it into the system. - -## Loading LLVM - -To load LLVM code, simply provide the location of a valid bitcode file -to the `llvm_load_module` function. - -* `llvm_load_module : String -> TopLevel LLVMModule` - -The resulting `LLVMModule` can be passed into the various functions -described below to perform analysis of specific LLVM functions. - -The LLVM bitcode parser should generally work with LLVM versions between -3.5 and 16.0, though it may be incomplete for some versions. Debug -metadata has changed somewhat throughout that version range, so is the -most likely case of incompleteness. We aim to support every version -after 3.5, however, so report any parsing failures as [on -GitHub](https://github.com/GaloisInc/saw-script/issues). - -## Loading Java - -Loading Java code is slightly more complex, because of the more -structured nature of Java packages. First, when running `saw`, three flags -control where to look for classes: - -* The `-b` flag takes the path where the `java` executable lives, which is used - to locate the Java standard library classes and add them to the class - database. Alternatively, one can put the directory where `java` lives on the - `PATH`, which SAW will search if `-b` is not set. - -* The `-j` flag takes the name of a JAR file as an argument and adds the - contents of that file to the class database. - -* The `-c` flag takes the name of a directory as an argument and adds all class - files found in that directory (and its subdirectories) to the class database. - By default, the current directory is included in the class path. - -Most Java programs will only require setting the `-b` flag (or the `PATH`), as -that is enough to bring in the standard Java libraries. Note that when -searching the `PATH`, SAW makes assumptions about where the standard library -classes live. These assumptions are likely to hold on JDK 7 or later, but they -may not hold on older JDKs on certain operating systems. If you are using an -old version of the JDK and SAW is unable to find a standard Java class, you may -need to specify the location of the standard classes' JAR file with the `-j` -flag (or, alternatively, with the `SAW_JDK_JAR` environment variable). - -Once the class path is configured, you can pass the name of a class to -the `java_load_class` function. - -* `java_load_class : String -> TopLevel JavaClass` - -The resulting `JavaClass` can be passed into the various functions -described below to perform analysis of specific Java methods. - -Java class files from any JDK newer than version 6 should work. However, -support for JDK 9 and later is experimental. Verifying code that only uses -primitive data types is known to work well, but there are some as-of-yet -unresolved issues in verifying code involving classes such as `String`. For -more information on these issues, refer to -[this GitHub issue](https://github.com/GaloisInc/crucible/issues/641). - -## Loading MIR - -To load a piece of Rust code, first compile it to a MIR JSON file, as described -in [this section](#compiling-mir), and then provide the location of the JSON -file to the `mir_load_module` function: - -* `mir_load_module : String -> TopLevel MIRModule` - -SAW currently supports Rust code that can be built with a [January 23, 2023 -Rust nightly](https://static.rust-lang.org/dist/2023-01-23/). If you encounter -a Rust feature that SAW does not support, please report it [on -GitHub](https://github.com/GaloisInc/saw-script/issues). - -## Notes on Compiling Code for SAW - -SAW will generally be able to load arbitrary LLVM bitcode, JVM bytecode, and -MIR JSON files, but several guidelines can help make verification easier or -more likely to succeed. - -### Compiling LLVM - -For generating LLVM with `clang`, it can be helpful to: - -* Turn on debugging symbols with `-g` so that SAW can find source - locations of functions, names of variables, etc. - -* Optimize with `-O1` so that the generated bitcode more closely matches - the C/C++ source, making the results more comprehensible. - -* Use `-fno-threadsafe-statics` to prevent `clang` from emitting - unnecessary pthread code. - -* Link all relevant bitcode with `llvm-link` (including, *e.g.*, the C++ - standard library when analyzing C++ code). - -All SAW proofs include side conditions to rule out undefined behavior, -and proofs will only succeed if all of these side conditions have been -discharged. However the default SAW notion of undefined behavior is with -respect to the semantics of LLVM, rather than C or C++. If you want to -rule out undefined behavior according to the C or C++ standards, -consider compiling your code with `-fsanitize=undefined` or one of the -related flags[^1] to `clang`. - -Generally, you'll also want to use `-fsanitize-trap=undefined`, or one -of the related flags, to cause the compiled code to use `llvm.trap` to -indicate the presence of undefined behavior. Otherwise, the compiled -code will call a separate function, such as -`__ubsan_handle_shift_out_of_bounds`, for each type of undefined -behavior, and SAW currently does not have built in support for these -functions (though you could manually create overrides for them in a -verification script). - -[^1]: https://clang.llvm.org/docs/UsersManual.html#controlling-code-generation - -### Compiling Java - -For Java, the only compilation flag that tends to be valuable is `-g` to -retain information about the names of function arguments and local -variables. - -### Compiling MIR - -In order to verify Rust code, SAW analyzes Rust's MIR (mid-level intermediate -representation) language. In particular, SAW analyzes a particular form of MIR -that the [`mir-json`](https://github.com/GaloisInc/mir-json) tool produces. You -will need to intall `mir-json` and run it on Rust code in order to produce MIR -JSON files that SAW can load (see [this section](#loading-mir)). You will also -need to use `mir-json` to build custom versions of the Rust standard libraries -that are more suited to verification purposes. - -If you are working from a checkout of the `saw-script` repo, you can install -the `mir-json` tool and the custom Rust standard libraries by performing the -following steps: - -1. Clone the [`crucible`](https://github.com/GaloisInc/crucible) and `mir-json` - submodules like so: - - ``` - $ git submodule update deps/crucible deps/mir-json - ``` - -2. Navigate to the `mir-json` submodule: - - ``` - $ cd deps/mir-json - ``` - -3. Follow the instructions laid out in the [`mir-json` installation - instructions](https://github.com/GaloisInc/mir-json#installation-instructions) - in order to install `mir-json`. - -4. Navigate to the - [`crux-mir`](https://github.com/GaloisInc/crucible/tree/master/crux-mir) - subdirectory of the `crucible` submodule: - - ``` - $ cd ../crucible/crux-mir/ - ``` - -5. Run the `translate_libs.sh` script: - - ``` - $ ./translate_libs.sh - ``` - - This will compile the custom versions of the Rust standard libraries using - `mir-json`, placing the results under the `rlibs` subdirectory. - -6. Finally, define a `SAW_RUST_LIBRARY_PATH` environment variable that points - to the newly created `rlibs` subdirectory: - - ``` - $ export SAW_RUST_LIBRARY_PATH=<...>/crucible/crux-mir/rlibs - ``` - -For `cargo`-based projects, `mir-json` provides a `cargo` subcommand called -`cargo saw-build` that builds a JSON file suitable for use with SAW. `cargo -saw-build` integrates directly with `cargo`, so you can pass flags to it like -any other `cargo` subcommand. For example: - -``` -# Make sure that SAW_RUST_LIBRARY_PATH is defined, as described above -$ cargo saw-build - -linking 11 mir files into <...>/example-364cf2df365c7055.linked-mir.json - -``` - -Note that: - -* The full output of `cargo saw-build` here is omitted. The important part is - the `.linked-mir.json` file that appears after `linking X mir files into`, as - that is the JSON file that must be loaded with SAW. -* `SAW_RUST_LIBRARY_PATH` should point to the the MIR JSON files for the Rust - standard library. - -`mir-json` also supports compiling individual `.rs` files through `mir-json`'s -`saw-rustc` command. As the name suggests, it accepts all of the flags that -`rustc` accepts. For example: - -``` -# Make sure that SAW_RUST_LIBRARY_PATH is defined, as described above -$ saw-rustc example.rs - -linking 11 mir files into <...>/example.linked-mir.json - -``` - -## Notes on C++ Analysis - -The distance between C++ code and LLVM is greater than between C and -LLVM, so some additional considerations come into play when analyzing -C++ code with SAW. - -The first key issue is that the C++ standard library is large and -complex, and tends to be widely used by C++ applications. To analyze -most C++ code, it will be necessary to link your code with a version of -the `libc++` library[^2] compiled to LLVM bitcode. The `wllvm` program -can[^3] be useful for this. - -The C++ standard library includes a number of key global variables, and -any code that touches them will require that they be initialized using -`llvm_alloc_global`. - -Many C++ names are slightly awkward to deal with in SAW. They may be -mangled relative to the text that appears in the C++ source code. SAW -currently only understands the mangled names. The `llvm-nm` program can -be used to show the list of symbols in an LLVM bitcode file, and the -`c++filt` program can be used to demangle them, which can help in -identifying the symbol you want to refer to. In addition, C++ names from -namespaces can sometimes include quote marks in their LLVM encoding. For -example: - -~~~~ -%"class.quux::Foo" = type { i32, i32 } -~~~~ - -This can be mentioned in SAW by saying: - -~~~~ -llvm_type "%\"class.quux::Foo\"" -~~~~ - -Finally, there is no support for calling constructors in specifications, -so you will need to construct objects piece-by-piece using, *e.g.*, -`llvm_alloc` and `llvm_points_to`. - -[^2]: https://libcxx.llvm.org/docs/BuildingLibcxx.html -[^3]: https://github.com/travitch/whole-program-llvm - -# Direct Extraction - -In the case of the `max` function described earlier, the relevant inputs -and outputs are immediately apparent. The function takes two integer -arguments, always uses both of them, and returns a single integer value, -making no other changes to the program state. - -In cases like this, a direct translation is possible, given only an -identification of which code to execute. Two functions exist to handle -such simple code. The first, for LLVM is the more stable of the two: - -* `llvm_extract : LLVMModule -> String -> TopLevel Term` - -A similar function exists for Java, but is more experimental. - -* `jvm_extract : JavaClass -> String -> TopLevel Term` - -Because of its lack of maturity, it (and later Java-related commands) -must be enabled by running the `enable_experimental` command beforehand. - -* `enable_experimental : TopLevel ()` - -The structure of these two extraction functions is essentially -identical. The first argument describes where to look for code (in -either a Java class or an LLVM module, loaded as described in the -previous section). The second argument is the name of the method or -function to extract. - -When the extraction functions complete, they return a `Term` -corresponding to the value returned by the function or method as a -function of its arguments. - -These functions currently work only for code that takes some fixed -number of integral parameters, returns an integral result, and does not -access any dynamically-allocated memory (although temporary memory -allocated during execution is allowed). - -# Creating Symbolic Variables - -The direct extraction process just discussed automatically introduces -symbolic variables and then abstracts over them, yielding a SAWScript -`Term` that reflects the semantics of the original Java, LLVM, or MIR code. -For simple functions, this is often the most convenient interface. For -more complex code, however, it can be necessary (or more natural) to -specifically introduce fresh variables and indicate what portions of the -program state they correspond to. - -* `fresh_symbolic : String -> Type -> TopLevel Term` is responsible for -creating new variables in this context. The first argument is a name -used for pretty-printing of terms and counter-examples. In many cases it -makes sense for this to be the same as the name used within SAWScript, -as in the following: - -~~~~ -x <- fresh_symbolic "x" ty; -~~~~ - -However, using the same name is not required. - -The second argument to `fresh_symbolic` is the type of the fresh -variable. Ultimately, this will be a SAWCore type; however, it is usually -convenient to specify it using Cryptol syntax with the type quoting -brackets `{|` and `|}`. For example, creating a 32-bit integer, as -might be used to represent a Java `int` or an LLVM `i32`, can be done as -follows: - -~~~~ -x <- fresh_symbolic "x" {| [32] |}; -~~~~ - -Although symbolic execution works best on symbolic variables, which are -"unbound" or "free", most of the proof infrastructure within SAW uses -variables that are *bound* by an enclosing lambda expression. Given a -`Term` with free symbolic variables, we can construct a lambda term that -binds them in several ways. - -* `abstract_symbolic : Term -> Term` finds all symbolic variables in the -`Term` and constructs a lambda expression binding each one, in some -order. The result is a function of some number of arguments, one for -each symbolic variable. It is the simplest but least flexible way to -bind symbolic variables. - -~~~~ -sawscript> x <- fresh_symbolic "x" {| [8] |} -sawscript> let t = {{ x + x }} -sawscript> print_term t -let { x@1 = Prelude.Vec 8 Prelude.Bool - } - in Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool (Cryptol.TCNum 8)) - x - x -sawscript> let f = abstract_symbolic t -sawscript> print_term f -let { x@1 = Prelude.Vec 8 Prelude.Bool - } - in \(x : x@1) -> - Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool (Cryptol.TCNum 8)) x x -~~~~ - -If there are multiple symbolic variables in the `Term` passed to -`abstract_symbolic`, the ordering of parameters can be hard to predict. -In some cases (such as when a proof is the immediate next step, and it's -expected to succeed) the order isn't important. In others, it's nice to -have more control over the order. - -* `lambda : Term -> Term -> Term` is the building block for controlled -binding. It takes two terms: the one to transform, and the portion of -the term to abstract over. Generally, the first `Term` is one obtained -from `fresh_symbolic` and the second is a `Term` that would be passed to -`abstract_symbolic`. - -~~~~ -sawscript> let f = lambda x t -sawscript> print_term f -let { x@1 = Prelude.Vec 8 Prelude.Bool - } - in \(x : x@1) -> - Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool (Cryptol.TCNum 8)) x x -~~~~ - -* `lambdas : [Term] -> Term -> Term` allows you to list the order in which -symbolic variables should be bound. Consider, for example, a `Term` -which adds two symbolic variables: - -~~~~ -sawscript> x1 <- fresh_symbolic "x1" {| [8] |} -sawscript> x2 <- fresh_symbolic "x2" {| [8] |} -sawscript> let t = {{ x1 + x2 }} -sawscript> print_term t -let { x@1 = Prelude.Vec 8 Prelude.Bool - } - in Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool (Cryptol.TCNum 8)) - x1 - x2 -~~~~ - -We can turn `t` into a function that takes `x1` followed by `x2`: - -~~~~ -sawscript> let f1 = lambdas [x1, x2] t -sawscript> print_term f1 -let { x@1 = Prelude.Vec 8 Prelude.Bool - } - in \(x1 : x@1) -> - \(x2 : x@1) -> - Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool (Cryptol.TCNum 8)) x1 - x2 -~~~~ - -Or we can turn `t` into a function that takes `x2` followed by `x1`: - -~~~~ -sawscript> let f1 = lambdas [x2, x1] t -sawscript> print_term f1 -let { x@1 = Prelude.Vec 8 Prelude.Bool - } - in \(x2 : x@1) -> - \(x1 : x@1) -> - Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool (Cryptol.TCNum 8)) x1 - x2 -~~~~ - -# Specification-Based Verification - -The built-in functions described so far work by extracting models of -code that can then be used for a variety of purposes, including proofs -about the properties of the code. - -When the goal is to prove equivalence between some LLVM, Java, or MIR code and -a specification, however, a more declarative approach is sometimes -convenient. The following sections describe an approach that combines -model extraction and verification with respect to a specification. A -verified specification can then be used as input to future -verifications, allowing the proof process to be decomposed. - -## Running a Verification - -Verification of LLVM is controlled by the `llvm_verify` command. - -~~~~ -llvm_verify : - LLVMModule -> - String -> - [CrucibleMethodSpec] -> - Bool -> - LLVMSetup () -> - ProofScript SatResult -> - TopLevel CrucibleMethodSpec -~~~~ - -The first two arguments specify the module and function name to verify, -as with `llvm_verify`. The third argument specifies the list of -already-verified specifications to use for compositional verification -(described later; use `[]` for now). The fourth argument specifies -whether to do path satisfiability checking, and the fifth gives the -specification of the function to be verified. Finally, the last argument -gives the proof script to use for verification. The result is a proved -specification that can be used to simplify verification of functions -that call this one. - -Similar commands are available for JVM programs: - -~~~~ -jvm_verify : - JavaClass -> - String -> - [JVMMethodSpec] -> - Bool -> - JVMSetup () -> - ProofScript SatResult -> - TopLevel JVMMethodSpec -~~~~ - -And for MIR programs: - -~~~~ -mir_verify : - MIRModule -> - String -> - [MIRSpec] -> - Bool -> - MIRSetup () -> - ProofScript () -> - TopLevel MIRSpec -~~~~ - -### Running a MIR-based verification - -(Note: API functions involving MIR verification require `enable_experimental` -in order to be used. As such, some parts of this API may change before being -finalized.) - -The `String` supplied as an argument to `mir_verify` is expected to be a -function _identifier_. An identifier is expected adhere to one of the following -conventions: - -* `/::` -* `::` - -Where: - -* `` is the name of the crate in which the function is defined. (If - you produced your MIR JSON file by compiling a single `.rs` file with - `saw-rustc`, then the crate name is the same as the name of the file, but - without the `.rs` file extension.) -* `` is a hash of the crate and its dependencies. In extreme - cases, it is possible for two different crates to have identical crate names, - in which case the disambiguator must be used to distinguish between the two - crates. In the common case, however, most crate names will correspond to - exactly one disambiguator, and you are allowed to leave out the - `/` part of the `String` in this case. If you supply an - identifier with an ambiguous crate name and omit the disambiguator, then SAW - will raise an error. -* `` is the path to the function within the crate. Sometimes, - this is as simple as the function name itself. In other cases, a function - path may involve multiple _segments_, depending on the module hierarchy for - the program being verified. For instance, a `read` function located in - `core/src/ptr/mod.rs` will have the identifier: - - ``` - core::ptr::read - ``` - - Where `core` is the crate name and `ptr::read` is the function path, which - has two segments `ptr` and `read`. There are also some special forms of - segments that appear for functions defined in certain language constructs. - For instance, if a function is defined in an `impl` block, then it will have - `{impl}` as one of its segments, e.g., - - ``` - core::ptr::const_ptr::{impl}::offset - ``` - - If you are in doubt about what the full identifier for a given function is, - consult the MIR JSON file for your program. - ------ - -Now we describe how to construct a value of type `LLVMSetup ()`, `JVMSetup ()`, -or `MIRSetup ()`. - -## Structure of a Specification - -A specifications for Crucible consists of three logical components: - -* A specification of the initial state before execution of the function. - -* A description of how to call the function within that state. - -* A specification of the expected final value of the program state. - -These three portions of the specification are written in sequence within a `do` -block of type `{LLVM,JVM,MIR}Setup`. The command `{llvm,jvm,mir}_execute_func` -separates the specification of the initial state from the specification of the -final state, and specifies the arguments to the function in terms of the -initial state. Most of the commands available for state description will work -either before or after `{llvm,jvm,mir}_execute_func`, though with slightly -different meaning, as described below. - -## Creating Fresh Variables - -In any case where you want to prove a property of a function for an entire -class of inputs (perhaps all inputs) rather than concrete values, the initial -values of at least some elements of the program state must contain fresh -variables. These are created in a specification with the -`{llvm,jvm,mir}_fresh_var` commands rather than `fresh_symbolic`. - -* `llvm_fresh_var : String -> LLVMType -> LLVMSetup Term` - -* `jvm_fresh_var : String -> JavaType -> JVMSetup Term` - -* `mir_fresh_var : String -> MIRType -> MIRSetup Term` - -The first parameter to both functions is a name, used only for -presentation. It's possible (though not recommended) to create multiple -variables with the same name, but SAW will distinguish between them -internally. The second parameter is the LLVM, Java, or MIR type of the -variable. The resulting `Term` can be used in various subsequent -commands. - -Note that the second parameter to `{llvm,jvm,mir}_fresh_var` must be a type -that has a counterpart in Cryptol. (For more information on this, refer to the -"Cryptol type correspondence" section.) If the type does not have a Cryptol -counterpart, the function will raise an error. If you do need to create a fresh -value of a type that cannot be represented in Cryptol, consider using a -function such as `llvm_fresh_expanded_val` (for LLVM verification) or -`mir_fresh_expanded_value` (for MIR verification). - -LLVM types are built with this set of functions: - -* `llvm_int : Int -> LLVMType` -* `llvm_alias : String -> LLVMType` -* `llvm_array : Int -> LLVMType -> LLVMType` -* `llvm_float : LLVMType` -* `llvm_double : LLVMType` -* `llvm_packed_struct : [LLVMType] -> LLVMType` -* `llvm_struct_type : [LLVMType] -> LLVMType` - -Java types are built up using the following functions: - -* `java_bool : JavaType` -* `java_byte : JavaType` -* `java_char : JavaType` -* `java_short : JavaType` -* `java_int : JavaType` -* `java_long : JavaType` -* `java_float : JavaType` -* `java_double : JavaType` -* `java_class : String -> JavaType` -* `java_array : Int -> JavaType -> JavaType` - -MIR types are built up using the following functions: - -* `mir_adt : MIRAdt -> MIRType` -* `mir_array : Int -> MIRType -> MIRType` -* `mir_bool : MIRType` -* `mir_char : MIRType` -* `mir_i8 : MIRType` -* `mir_i6 : MIRType` -* `mir_i32 : MIRType` -* `mir_i64 : MIRType` -* `mir_i128 : MIRType` -* `mir_isize : MIRType` -* `mir_f32 : MIRType` -* `mir_f64 : MIRType` -* `mir_lifetime : MIRType` -* `mir_ref : MIRType -> MIRType` -* `mir_ref_mut : MIRType -> MIRType` -* `mir_slice : MIRType -> MIRType` -* `mir_str : MIRType` -* `mir_tuple : [MIRType] -> MIRType` -* `mir_u8 : MIRType` -* `mir_u6 : MIRType` -* `mir_u32 : MIRType` -* `mir_u64 : MIRType` -* `mir_u128 : MIRType` -* `mir_usize : MIRType` - -Most of these types are straightforward mappings to the standard LLVM -and Java types. The one key difference is that arrays must have a fixed, -concrete size. Therefore, all analysis results are valid only under the -assumption that any arrays have the specific size indicated, and may not -hold for other sizes. - -The `llvm_int` function takes an `Int` parameter indicating the variable's bit -width. For example, the C `uint16_t` and `int16_t` types correspond to -`llvm_int 16`. The C `bool` type is slightly trickier. A bare `bool` type -typically corresponds to `llvm_int 1`, but if a `bool` is a member of a -composite type such as a pointer, array, or struct, then it corresponds to -`llvm_int 8`. This is due to a peculiarity in the way Clang compiles `bool` -down to LLVM. When in doubt about how a `bool` is represented, check the LLVM -bitcode by compiling your code with `clang -S -emit-llvm`. - -LLVM types can also be specified in LLVM syntax directly by using the -`llvm_type` function. - -* `llvm_type : String -> LLVMType` - -For example, `llvm_type "i32"` yields the same result as `llvm_int 32`. - -The most common use for creating fresh variables is to state that a -particular function should have the specified behaviour for arbitrary -initial values of the variables in question. Sometimes, however, it can -be useful to specify that a function returns (or stores, more about this -later) an arbitrary value, without specifying what that value should be. -To express such a pattern, you can also run `llvm_fresh_var` from -the post state (i.e., after `llvm_execute_func`). - -## The SetupValue and JVMValue Types - -Many specifications require reasoning about both pure values and about -the configuration of the heap. The `SetupValue` type corresponds to -values that can occur during symbolic execution, which includes both -`Term` values, pointers, and composite types consisting of either of -these (both structures and arrays). - -The `llvm_term`, `jvm_term`, and `mir_term` functions create a `SetupValue`, -`JVMValue`, or `MIRValue`, respectively, from a `Term`: - -* `llvm_term : Term -> SetupValue` -* `jvm_term : Term -> JVMValue` -* `mir_term : Term -> MIRValue` - -The value that these functions return will have an LLVM, JVM, or MIR type -corresponding to the Cryptol type of the `Term` argument. (For more information -on this, refer to the "Cryptol type correspondence" section.) If the type does -not have a Cryptol counterpart, the function will raise an error. - -### Cryptol type correspondence - -The `{llvm,jvm,mir}_fresh_var` functions take an LLVM, JVM, or MIR type as an -argument and produces a `Term` variable of the corresponding Cryptol type as -output. Similarly, the `{llvm,jvm,mir}_term` functions take a Cryptol `Term` as -input and produce a value of the corresponding LLVM, JVM, or MIR type as -output. This section describes precisely which types can be converted to -Cryptol types (and vice versa) in this way. - -#### LLVM verification - -The following LLVM types correspond to Cryptol types: - -* `llvm_alias `: Corresponds to the same Cryptol type as the type used - in the definition of ``. -* `llvm_array `: Corresponds to the Cryptol sequence `[][]`, - where `` is the Cryptol type corresponding to ``. -* `llvm_int `: Corresponds to the Cryptol word `[]`. -* `llvm_struct_type [, ..., ]` and `llvm_packed_struct [, ..., ]`: - Corresponds to the Cryptol tuple `(, ..., )`, where `` - is the Cryptol type corresponding to `` for each `i` ranging from `1` - to `n`. - -The following LLVM types do _not_ correspond to Cryptol types: - -* `llvm_double` -* `llvm_float` -* `llvm_pointer` - -#### JVM verification - -The following Java types correspond to Cryptol types: - -* `java_array `: Corresponds to the Cryptol sequence `[][]`, - where `` is the Cryptol type corresponding to ``. -* `java_bool`: Corresponds to the Cryptol `Bit` type. -* `java_byte`: Corresponds to the Cryptol `[8]` type. -* `java_char`: Corresponds to the Cryptol `[16]` type. -* `java_int`: Corresponds to the Cryptol `[32]` type. -* `java_long`: Corresponds to the Cryptol `[64]` type. -* `java_short`: Corresponds to the Cryptol `[16]` type. - -The following Java types do _not_ correspond to Cryptol types: - -* `java_class` -* `java_double` -* `java_float` - -#### MIR verification - -The following MIR types correspond to Cryptol types: - -* `mir_array `: Corresponds to the Cryptol sequence `[][]`, - where `` is the Cryptol type corresponding to ``. -* `mir_bool`: Corresponds to the Cryptol `Bit` type. -* `mir_char`: Corresponds to the Cryptol `[32]` type. -* `mir_i8` and `mir_u8`: Corresponds to the Cryptol `[8]` type. -* `mir_i16` and `mir_u16`: Corresponds to the Cryptol `[16]` type. -* `mir_i32` and `mir_u32`: Corresponds to the Cryptol `[32]` type. -* `mir_i64` and `mir_u64`: Corresponds to the Cryptol `[64]` type. -* `mir_i128` and `mir_u128`: Corresponds to the Cryptol `[128]` type. -* `mir_isize` and `mir_usize`: Corresponds to the Cryptol `[32]` type. -* `mir_tuple [, ..., ]`: Corresponds to the Cryptol tuple - `(, ..., )`, where `` is the Cryptol type corresponding - to `` for each `i` ranging from `1` to `n`. - -The following MIR types do _not_ correspond to Cryptol types: - -* `mir_adt` -* `mir_f32` -* `mir_f64` -* `mir_ref` and `mir_ref_mut` -* `mir_slice` -* `mir_str` - -## Executing - -Once the initial state has been configured, the `{llvm,jvm,mir}_execute_func` -command specifies the parameters of the function being analyzed in terms -of the state elements already configured. - -* `llvm_execute_func : [SetupValue] -> LLVMSetup ()` -* `jvm_execute_func : [JVMValue] -> JVMSetup ()` -* `mir_execute_func : [MIRValue] -> MIRSetup ()` - -## Return Values - -To specify the value that should be returned by the function being -verified use the `{llvm,jvm,mir}_return` command. - -* `llvm_return : SetupValue -> LLVMSetup ()` -* `jvm_return : JVMValue -> JVMSetup ()` -* `mir_return : MIRValue -> MIRSetup ()` - -## A First Simple Example - -The commands introuduced so far are sufficient to verify simple programs -that do not use pointers (or that use them only internally). Consider, -for instance the C program that adds its two arguments together: - -~~~~ -#include -uint32_t add(uint32_t x, uint32_t y) { - return x + y; -} -~~~~ - -We can specify this function's expected behavior as follows: - -~~~~ -let add_setup = do { - x <- llvm_fresh_var "x" (llvm_int 32); - y <- llvm_fresh_var "y" (llvm_int 32); - llvm_execute_func [llvm_term x, llvm_term y]; - llvm_return (llvm_term {{ x + y : [32] }}); -}; -~~~~ - -We can then compile the C file `add.c` into the bitcode file `add.bc` -and verify it with ABC: - -~~~~ -m <- llvm_load_module "add.bc"; -add_ms <- llvm_verify m "add" [] false add_setup abc; -~~~~ - -## Compositional Verification - -The primary advantage of the specification-based approach to -verification is that it allows for compositional reasoning. That is, -when proving properties of a given method or function, we can make use -of properties we have already proved about its callees rather than -analyzing them anew. This enables us to reason about much larger -and more complex systems than otherwise possible. - -The `llvm_verify`, `jvm_verify`, and `mir_verify` functions return values of -type `CrucibleMethodSpec`, `JVMMethodSpec`, and `MIRMethodSpec`, respectively. -These values are opaque objects that internally contain both the information -provided in the associated `LLVMSetup`, `JVMSetup`, or `MIRSetup` blocks, -respectively, and the results of the verification process. - -Any of these `MethodSpec` objects can be passed in via the third -argument of the `..._verify` functions. For any function or method -specified by one of these parameters, the simulator will not follow -calls to the associated target. Instead, it will perform the following -steps: - -* Check that all `llvm_points_to` and `llvm_precond` statements - (or the corresponding JVM or MIR statements) in the specification are - satisfied. - -* Update the simulator state and optionally construct a return value as - described in the specification. - -More concretely, building on the previous example, say we have a -doubling function written in terms of `add`: - -~~~~ -uint32_t dbl(uint32_t x) { - return add(x, x); -} -~~~~ - -It has a similar specification to `add`: - -~~~~ -let dbl_setup = do { - x <- llvm_fresh_var "x" (llvm_int 32); - llvm_execute_func [llvm_term x]; - llvm_return (llvm_term {{ x + x : [32] }}); -}; -~~~~ - -And we can verify it using what we've already proved about `add`: - -~~~~ -llvm_verify m "dbl" [add_ms] false dbl_setup abc; -~~~~ - -In this case, doing the verification compositionally doesn't save -computational effort, since the functions are so simple, but it -illustrates the approach. - -### Compositional Verification and Mutable Allocations - -A common pitfall when using compositional verification is to reuse a -specification that underspecifies the value of a mutable allocation. In -general, doing so can lead to unsound verification, so SAW goes through great -lengths to check for this. - -Here is an example of this pitfall in an LLVM verification. Given this C code: - -~~~ .c -void side_effect(uint32_t *a) { - *a = 0; -} - -uint32_t foo(uint32_t x) { - uint32_t b = x; - side_effect(&b); - return b; -} -~~~ - -And the following SAW specifications: - -~~~ -let side_effect_spec = do { - a_ptr <- llvm_alloc (llvm_int 32); - a_val <- llvm_fresh_var "a_val" (llvm_int 32); - llvm_points_to a_ptr (llvm_term a_val); - - llvm_execute_func [a_ptr]; -}; - -let foo_spec = do { - x <- llvm_fresh_var "x" (llvm_int 32); - - llvm_execute_func [llvm_term x]; - - llvm_return (llvm_term x); -}; -~~~ - -Should SAW be able to verify the `foo` function against `foo_spec` using -compositional verification? That is, should the following be expected to work? - -~~~ -side_effect_ov <- llvm_verify m "side_effect" [] false side_effect_spec z3; -llvm_verify m "foo" [side_effect_ov] false foo_spec z3; -~~~ - -A literal reading of `side_effect_spec` would suggest that the `side_effect` -function allocates `a_ptr` but then does nothing with it, implying that `foo` -returns its argument unchanged. This is incorrect, however, as the -`side_effect` function actually changes its argument to point to `0`, so the -`foo` function ought to return `0` as a result. SAW should not verify `foo` -against `foo_spec`, and indeed it does not. - -The problem is that `side_effect_spec` underspecifies the value of `a_ptr` in -its postconditions, which can lead to the potential unsoundness seen above when -`side_effect_spec` is used in compositional verification. To prevent this -source of unsoundness, SAW will _invalidate_ the underlying memory of any -mutable pointers (i.e., those declared with `llvm_alloc`, not -`llvm_alloc_global`) allocated in the preconditions of compositional override -that do not have a corresponding `llvm_points_to` statement in the -postconditions. Attempting to read from invalidated memory constitutes an -error, as can be seen in this portion of the error message when attempting to -verify `foo` against `foo_spec`: - -~~~ -invalidate (state of memory allocated in precondition (at side.saw:3:12) not described in postcondition) -~~~ - -To fix this particular issue, add an `llvm_points_to` statement to -`side_effect_spec`: - -~~~ -let side_effect_spec = do { - a_ptr <- llvm_alloc (llvm_int 32); - a_val <- llvm_fresh_var "a_val" (llvm_int 32); - llvm_points_to a_ptr (llvm_term a_val); - - llvm_execute_func [a_ptr]; - - // This is new - llvm_points_to a_ptr (llvm_term {{ 0 : [32] }}); -}; -~~~ - -After making this change, SAW will reject `foo_spec` for a different reason, as -it claims that `foo` returns its argument unchanged when it actually returns -`0`. - -Note that invalidating memory itself does not constitute an error, so if the -`foo` function never read the value of `b` after calling `side_effect(&b)`, -then there would be no issue. It is only when a function attempts to _read_ -from invalidated memory that an error is thrown. In general, it can be -difficult to predict when a function will or will not read from invalidated -memory, however. For this reason, it is recommended to always specify the -values of mutable allocations in the postconditions of your specs, as it can -avoid pitfalls like the one above. - -The same pitfalls apply to compositional MIR verification, with a couple of key -differences. In MIR verification, mutable references are allocated using -`mir_alloc_mut`. Here is a Rust version of the pitfall program above: - -~~~ .rs -pub fn side_effect(a: &mut u32) { - *a = 0; -} - -pub fn foo(x: u32) -> u32 { - let mut b: u32 = x; - side_effect(&mut b); - b -} -~~~ - -~~~ -let side_effect_spec = do { - a_ref <- mir_alloc_mut mir_u32; - a_val <- mir_fresh_var "a_val" mir_u32; - mir_points_to a_ref (mir_term a_val); - - mir_execute_func [a_ref]; -}; - -let foo_spec = do { - x <- mir_fresh_var "x" mir_u32; - - mir_execute_func [mir_term x]; - - mir_return (mir_term {{ x }}); -}; -~~~ - -Just like above, if you attempted to prove `foo` against `foo_spec` using -compositional verification: - -~~~ -side_effect_ov <- mir_verify m "test::side_effect" [] false side_effect_spec z3; -mir_verify m "test::foo" [side_effect_ov] false foo_spec z3; -~~~ - -Then SAW would throw an error, as `side_effect_spec` underspecifies the value -of `a_ref` in its postconditions. `side_effect_spec` can similarly be repaired -by adding a `mir_points_to` statement involving `a_ref` in `side_effect_spec`'s -postconditions. - -MIR verification differs slightly from LLVM verification in how it catches -underspecified mutable allocations when using compositional overrides. The LLVM -memory model achieves this by invalidating the underlying memory in -underspecified allocations. The MIR memory model, on the other hand, does not -have a direct counterpart to memory invalidation. As a result, any MIR overrides -must specify the values of all mutable allocations in their postconditions, -_even if the function that calls the override never uses the allocations_. - -To illustrate this point more finely, suppose that the `foo` function had -instead been defined like this: - -~~~ .rs -pub fn foo(x: u32) -> u32 { - let mut b: u32 = x; - side_effect(&mut b); - 42 -} -~~~ - -Here, it does not particularly matter what effects the `side_effect` function -has on its argument, as `foo` will now return `42` regardless. Still, if you -attempt to prove `foo` by using `side_effect` as a compositional override, then -it is strictly required that you specify the value of `side_effect`'s argument -in its postconditions, even though the answer that `foo` returns is unaffected -by this. This is in contrast with LLVM verification, where one could get away -without specifying `side_effect`'s argument in this example, as the invalidated -memory in `b` would never be read. - -### Compositional Verification and Mutable Global Variables - -Just like with local mutable allocations (see the previous section), -specifications used in compositional overrides must specify the values of -mutable global variables in their postconditions. To illustrate this using LLVM -verification, here is a variant of the C program from the previous example that -uses a mutable global variable `a`: - -~~~ .c - -uint32_t a = 42; - -void side_effect(void) { - a = 0; -} - -uint32_t foo(void) { - side_effect(); - return a; -} -~~~ - -If we attempted to verify `foo` against this `foo_spec` specification using -compositional verification: - -~~~ -let side_effect_spec = do { - llvm_alloc_global "a"; - llvm_points_to (llvm_global "a") (llvm_global_initializer "a"); - - llvm_execute_func []; -}; - -let foo_spec = do { - llvm_alloc_global "a"; - llvm_points_to (llvm_global "a") (llvm_global_initializer "a"); - - llvm_execute_func []; - - llvm_return (llvm_global_initializer "a"); -}; - -side_effect_ov <- llvm_verify m "side_effect" [] false side_effect_spec z3; -llvm_verify m "foo" [side_effect_ov] false foo_spec z3; -~~~ - -Then SAW would reject it, as `side_effect_spec` does not specify what `a`'s -value should be in its postconditions. Just as with local mutable allocations, -SAW will invalidate the underlying memory in `a`, and subsequently reading from -`a` in the `foo` function will throw an error. The solution is to add an -`llvm_points_to` statement in the postconditions that declares that `a`'s value -is set to `0`. - -The same concerns apply to MIR verification, where mutable global variables are -referred to as `static mut` items. (See the [MIR static -items](#mir-static-items) section for more information). Here is a Rust version -of the program above: - -~~~ .rs -static mut A: u32 = 42; - -pub fn side_effect() { - unsafe { - A = 0; - } -} - -pub fn foo() -> u32 { - side_effect(); - unsafe { A } -} -~~~ - -~~~ -let side_effect_spec = do { - mir_points_to (mir_static "test::A") (mir_static_initializer "test::A"); - - mir_execute_func []; -}; - -let foo_spec = do { - mir_points_to (mir_static "test::A") (mir_static_initializer "test::A"); - - mir_execute_func []; - - mir_return (mir_static_initializer "test::A"); -}; - -side_effect_ov <- mir_verify m "side_effect" [] false side_effect_spec z3; -mir_verify m "foo" [side_effect_ov] false foo_spec z3; -~~~ - -Just as above, we can repair this by adding a `mir_points_to` statement in -`side_effect_spec`'s postconditions that specifies that `A` is set to `0`. - -Recall from the previous section that MIR verification is stricter than LLVM -verification when it comes to specifying mutable allocations in the -postconditions of compositional overrides. This is especially true for mutable -static items. In MIR verification, any compositional overrides must specify the -values of all mutable static items in the entire program in their -postconditions, _even if the function that calls the override never uses the -static items_. For example, if the `foo` function were instead defined like -this: - -~~~ .rs -pub fn foo() -> u32 { - side_effect(); - 42 -} -~~~ - -Then it is still required for `side_effect_spec` to specify what `A`'s value -will be in its postconditions, despite the fact that this has no effect on the -value that `foo` will return. - -## Specifying Heap Layout - -Most functions that operate on pointers expect that certain pointers -point to allocated memory before they are called. The `llvm_alloc` -command allows you to specify that a function expects a particular -pointer to refer to an allocated region appropriate for a specific type. - -* `llvm_alloc : LLVMType -> LLVMSetup SetupValue` - -This command returns a `SetupValue` consisting of a pointer to the -allocated space, which can be used wherever a pointer-valued -`SetupValue` can be used. - -In the initial state, `llvm_alloc` specifies that the function -expects a pointer to allocated space to exist. In the final state, it -specifies that the function itself performs an allocation. - -When using the experimental Java implementation, separate functions -exist for specifying that arrays or objects are allocated: - -* `jvm_alloc_array : Int -> JavaType -> JVMSetup JVMValue` specifies an -array of the given concrete size, with elements of the given type. - -* `jvm_alloc_object : String -> JVMSetup JVMValue` specifies an object -of the given class name. - -The experimental MIR implementation also has a `mir_alloc` function, which -behaves similarly to `llvm_alloc`. `mir_alloc` creates an immutable reference, -but there is also a `mir_alloc_mut` function for creating a mutable reference: - -* `mir_alloc : MIRType -> MIRSetup MIRValue` - -* `mir_alloc_mut : MIRType -> MIRSetup MIRValue` - -MIR tracks whether references are mutable or immutable at the type level, so it -is important to use the right allocation command for a given reference type. - -In LLVM, it's also possible to construct fresh pointers that do not -point to allocated memory (which can be useful for functions that -manipulate pointers but not the values they point to): - -* `llvm_fresh_pointer : LLVMType -> LLVMSetup SetupValue` - -The NULL pointer is called `llvm_null` in LLVM and `jvm_null` in -JVM: - -* `llvm_null : SetupValue` -* `jvm_null : JVMValue` - -One final, slightly more obscure command is the following: - -* `llvm_alloc_readonly : LLVMType -> LLVMSetup SetupValue` - -This works like `llvm_alloc` except that writes to the space -allocated are forbidden. This can be useful for specifying that a -function should take as an argument a pointer to allocated space that it -will not modify. Unlike `llvm_alloc`, regions allocated with -`llvm_alloc_readonly` are allowed to alias other read-only regions. - -## Specifying Heap Values - -Pointers returned by `llvm_alloc`, `jvm_alloc_{array,object}`, or -`mir_alloc{,_mut}` don't initially point to anything. So if you pass such a -pointer directly into a function that tried to dereference it, symbolic -execution will fail with a message about an invalid load. For some functions, -such as those that are intended to initialize data structures (writing to the -memory pointed to, but never reading from it), this sort of uninitialized -memory is appropriate. In most cases, however, it's more useful to state that a -pointer points to some specific (usually symbolic) value, which you can do with -the *points-to* family of commands. - -### LLVM heap values - -LLVM verification primarily uses the `llvm_points_to` command: - -* `llvm_points_to : SetupValue -> SetupValue -> LLVMSetup ()` -takes two `SetupValue` arguments, the first of which must be a pointer, -and states that the memory specified by that pointer should contain the -value given in the second argument (which may be any type of -`SetupValue`). - -When used in the final state, `llvm_points_to` specifies that the -given pointer *should* point to the given value when the function -finishes. - -Occasionally, because C programs frequently reinterpret memory of one -type as another through casts, it can be useful to specify that a -pointer points to a value that does not agree with its static type. - -* `llvm_points_to_untyped : SetupValue -> SetupValue -> -LLVMSetup ()` works like `llvm_points_to` but omits type -checking. Rather than omitting type checking across the board, we -introduced this additional function to make it clear when a type -reinterpretation is intentional. As an alternative, one -may instead use `llvm_cast_pointer` to line up the static types. - -### JVM heap values - -JVM verification has two categories of commands for specifying heap values. -One category consists of the `jvm_*_is` commands, which allow users to directly -specify what value a heap object points to. There are specific commands for -each type of JVM heap object: - -* `jvm_array_is : JVMValue -> Term -> JVMSetup ()` declares that an array (the - first argument) contains a sequence of values (the second argument). -* `jvm_elem_is : JVMValue -> Int -> JVMValue -> JVMSetup ()` declares that an - array (the first argument) has an element at the given index (the second - argument) containing the given value (the third argument). -* `jvm_field_is : JVMValue -> String -> JVMValue -> JVMSetup ()` declares that - an object (the first argument) has a field (the second argument) containing - the given value (the third argument). -* `jvm_static_field_is : String -> JVMValue -> JVMSetup ()` declares that a - named static field (the first argument) contains the given value (the second - argument). By default, the field name is assumed to belong to the same class - as the method being specified. Static fields belonging to other classes can - be selected using the `.` syntax in the first argument. - -Another category consists of the `jvm_modifies_*` commands. Like the `jvm_*_is` -commands, these specify that a JVM heap object points to valid memory, but -unlike the `jvm_*_is` commands, they leave the exact value being pointed to as -unspecified. These are useful for writing partial specifications for methods -that modify some heap value, but without saying anything specific about the new -value. - -* `jvm_modifies_array : JVMValue -> JVMSetup ()` -* `jvm_modifies_elem : JVMValue -> Int -> JVMSetup ()` -* `jvm_modifies_field : JVMValue -> String -> JVMSetup ()` -* `jvm_modifies_static_field : String -> JVMSetup ()` - -### MIR heap values - -MIR verification has a single `mir_points_to` command: - -* `mir_points_to : MIRValue -> MIRValue -> MIRSetup ()` -takes two `SetupValue` arguments, the first of which must be a reference, -and states that the memory specified by that reference should contain the -value given in the second argument (which may be any type of -`SetupValue`). - -## Working with Compound Types - -The commands mentioned so far give us no way to specify the values of -compound types (arrays or `struct`s). Compound values can be dealt with -either piecewise or in their entirety. - -* `llvm_elem : SetupValue -> Int -> SetupValue` yields a pointer to -an internal element of a compound value. For arrays, the `Int` parameter -is the array index. For `struct` values, it is the field index. - -* `llvm_field : SetupValue -> String -> SetupValue` yields a pointer -to a particular named `struct` field, if debugging information is -available in the bitcode. - -Either of these functions can be used with `llvm_points_to` to -specify the value of a particular array element or `struct` field. -Sometimes, however, it is more convenient to specify all array elements -or field values at once. The `llvm_array_value` and `llvm_struct_value` -functions construct compound values from lists of element values. - -* `llvm_array_value : [SetupValue] -> SetupValue` -* `llvm_struct_value : [SetupValue] -> SetupValue` - -To specify an array or struct in which each element or field is -symbolic, it would be possible, but tedious, to use a large combination -of `llvm_fresh_var` and `llvm_elem` or `llvm_field` commands. -However, the following function can simplify the common case -where you want every element or field to have a fresh value. - -* `llvm_fresh_expanded_val : LLVMType -> LLVMSetup SetupValue` - -The `llvm_struct_value` function normally creates a `struct` whose layout -obeys the alignment rules of the platform specified in the LLVM file -being analyzed. Structs in LLVM can explicitly be "packed", however, so -that every field immediately follows the previous in memory. The -following command will create values of such types: - -* `llvm_packed_struct_value : [SetupValue] -> SetupValue` - -C programs will sometimes make use of pointer casting to implement -various kinds of polymorphic behaviors, either via direct pointer -casts, or by using `union` types to codify the pattern. To reason -about such cases, the following operation is useful. - -* `llvm_cast_pointer : SetupValue -> LLVMType -> SetupValue` - -This function function casts the type of the input value (which must be a -pointer) so that it points to values of the given type. This mainly -affects the results of subsequent `llvm_field` and `llvm_elem` calls, -and any eventual `points_to` statements that the resulting pointer -flows into. This is especially useful for dealing with C `union` -types, as the type information provided by LLVM is imprecise in these -cases. - -We can automate the process of applying pointer casts if we have debug -information avaliable: - -* `llvm_union : SetupValue -> String -> SetupValue` - -Given a pointer setup value, this attempts to select the named union -branch and cast the type of the pointer. For this to work, debug -symbols must be included; moreover, the process of correlating LLVM -type information with information contained in debug symbols is a bit -heuristic. If `llvm_union` cannot figure out how to cast a pointer, -one can fall back on the more manual `llvm_cast_pointer` instead. - - -In the experimental Java verification implementation, the following -functions can be used to state the equivalent of a combination of -`llvm_points_to` and either `llvm_elem` or `llvm_field`. - -* `jvm_elem_is : JVMValue -> Int -> JVMValue -> JVMSetup ()` specifies -the value of an array element. - -* `jvm_field_is : JVMValue -> String -> JVMValue -> JVMSetup ()` -specifies the name of an object field. - -In the experimental MIR verification implementation, the following functions -construct compound values: - -* `mir_array_value : MIRType -> [MIRValue] -> MIRValue` constructs an array - of the given type whose elements consist of the given values. Supplying the - element type is necessary to support length-0 arrays. -* `mir_enum_value : MIRAdt -> String -> [MIRValue] -> MIRValue` constructs an - enum using a particular enum variant. The `MIRAdt` arguments determines what - enum type to create, the `String` value determines the name of the variant to - use, and the `[MIRValue]` list are the values to use as elements in the - variant. - - See the "Finding MIR algebraic data types" section (as well as the "Enums" - subsection) for more information on how to compute a `MIRAdt` value to pass - to `mir_enum_value`. -* `mir_slice_value : MIRValue -> MIRValue`: see the "MIR slices" section below. -* `mir_slice_range_value : MIRValue -> Int -> Int -> MIRValue`: see the - "MIR slices" section below. -* `mir_str_slice_value : MIRValue -> MIRValue`: see the "MIR slices" section - below. -* `mir_str_slice_range_value : MIRValue -> Int -> Int -> MIRValue`: see the - "MIR slices" section below. -* `mir_struct_value : MIRAdt -> [MIRValue] -> MIRValue` construct a struct - with the given list of values as elements. The `MIRAdt` argument determines - what struct type to create. - - See the "Finding MIR algebraic data types" section for more information on how - to compute a `MIRAdt` value to pass to `mir_struct_value`. -* `mir_tuple_value : [MIRValue] -> MIRValue` construct a tuple with the given - list of values as elements. - -To specify a compound value in which each element or field is symbolic, it -would be possible, but tedious, to use a large number of `mir_fresh_var` -invocations in conjunction with the commands above. However, the following -function can simplify the common case where you want every element or field to -have a fresh value: - -* `mir_fresh_expanded_value : String -> MIRType -> MIRSetup MIRValue` - -The `String` argument denotes a prefix to use when generating the names of -fresh symbolic variables. The `MIRType` can be any type, with the exception of -reference types (or compound types that contain references as elements or -fields), which are not currently supported. - -### MIR slices - -Slices are a unique form of compound type that is currently only used during -MIR verification. Unlike other forms of compound values, such as arrays, it is -not possible to directly construct a slice. Instead, one must take a slice of -an existing reference value that points to the thing being sliced. - -SAW currently supports taking slices of arrays and strings. - -#### Array slices - -The following commands are used to construct slices of arrays: - -* `mir_slice_value : MIRValue -> MIRValue`: the SAWScript expression - `mir_slice_value base` is equivalent to the Rust expression `&base[..]`, - i.e., a slice of the entirety of `base`. `base` must be a reference to an - array value (`&[T; N]` or `&mut [T; N]`), not an array itself. The type of - `mir_slice_value base` will be `&[T]` (if `base` is an immutable reference) - or `&mut [T]` (if `base` is a mutable reference). -* `mir_slice_range_value : MIRValue -> Int -> Int -> MIRValue`: the SAWScript - expression `mir_slice_range_value base start end` is equivalent to the Rust - expression `&base[start..end]`, i.e., a slice over a part of `base` which - ranges from `start` to `end`. `base` must be a reference to an array value - (`&[T; N]` or `&mut [T; N]`), not an array itself. The type of - `mir_slice_value base` will be `&[T]` (if `base` is an immutable reference) - or `&mut [T]` (if `base` is a mutable reference). - - `start` and `end` are assumed to be zero-indexed. `start` must not exceed - `end`, and `end` must not exceed the length of the array that `base` points - to. - -As an example of how to use these functions, consider this Rust function, which -accepts an arbitrary slice as an argument: - -~~~~ .rs -pub fn f(s: &[u32]) -> u32 { - s[0] + s[1] -} -~~~~ - -We can write a specification that passes a slice to the array `[1, 2, 3, 4, 5]` -as an argument to `f`: - -~~~~ -let f_spec_1 = do { - a <- mir_alloc (mir_array 5 mir_u32); - mir_points_to a (mir_term {{ [1, 2, 3, 4, 5] : [5][32] }}); - - mir_execute_func [mir_slice_value a]; - - mir_return (mir_term {{ 3 : [32] }}); -}; -~~~~ - -Alternatively, we can write a specification that passes a part of this array -over the range `[1..3]`, i.e., ranging from second element to the fourth. -Because this is a half-open range, the resulting slice has length 2: - -~~~~ -let f_spec_2 = do { - a <- mir_alloc (mir_array 5 mir_u32); - mir_points_to a (mir_term {{ [1, 2, 3, 4, 5] : [5][32] }}); - - mir_execute_func [mir_slice_range_value a 1 3]; - - mir_return (mir_term {{ 5 : [32] }}); -}; -~~~~ - -Note that we are passing _references_ of arrays to `mir_slice_value` and -`mir_slice_range_value`. It would be an error to pass a bare array to these -functions, so the following specification would be invalid: - -~~~~ -let f_fail_spec_ = do { - let arr = mir_term {{ [1, 2, 3, 4, 5] : [5][32] }}; - - mir_execute_func [mir_slice_value arr]; // BAD: `arr` is not a reference - - mir_return (mir_term {{ 3 : [32] }}); -}; -~~~~ - -Note that The `mir_slice_range_value` function must accept bare `Int` arguments -to specify the lower and upper bounds of the range. A consequence of this -design is that it is not possible to create a slice with a symbolic length. If -this limitation prevents you from using SAW, please file an issue [on -GitHub](https://github.com/GaloisInc/saw-script/issues). - -#### String slices - -In addition to slices of arrays (i.e., of type `&[T]`), SAW also supports -slices of strings (i.e., of type `&str`) through the following commands: - -* `mir_str_slice_value : MIRValue -> MIRValue`: the SAWScript expression - `mir_str_slice_value base` is equivalent to the Rust expression `&base[..]`, - i.e., a slice of the entirety of `base`. `base` must be a reference to an - array of bytes (`&[u8; N]` or `&mut [u8; N]`), not an array itself. The type - of `mir_str_slice_value base` will be `&str` (if `base` is an immutable - reference) or `&mut str` (if `base` is a mutable reference). -* `mir_str_slice_range_value : MIRValue -> Int -> Int -> MIRValue`: the - SAWScript expression `mir_slice_range_value base start end` is equivalent to - the Rust expression `&base[start..end]`, i.e., a slice over a part of `base` - which ranges from `start` to `end`. `base` must be a reference to an array of - bytes (`&[u8; N]` or `&mut [u8; N]`), not an array itself. The type of - `mir_slice_value base` will be `&str` (if `base` is an immutable reference) - or `&mut str` (if `base` is a mutable reference). - - `start` and `end` are assumed to be zero-indexed. `start` must not exceed - `end`, and `end` must not exceed the length of the array that `base` points - to. - -One unusual requirement about `mir_str_slice_value` and -`mir_str_slice_range_value` is that they require the argument to be of type -`&[u8; N]`, i.e., a reference to an array of bytes. This is an artifact of the -way that strings are encoded in Cryptol. The following Cryptol expressions: - -* `"A"` -* `"123"` -* `"Hello World"` - -Have the following types: - -* `[1][8]` -* `[3][8]` -* `[11][8]` - -This is because Cryptol strings are syntactic shorthand for sequences of bytes. -The following Cryptol expressions are wholly equivalent: - -* `[0x41]` -* `[0x31, 0x32, 0x33]` -* `[0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64]` - -These represent the strings in the extended ASCII character encoding. The -Cryptol sequence type `[N][8]` is equivalent to the Rust type `[u8; N]`, so the -requirement to have something of type `&[u8; N]` as an argument reflects this -design choice. - -Note that `mir_str_slice_value ` is _not_ the same thing as -`mir_slice_value `, as the two commands represent different types -of Rust values. While both commands take a `` as an argument, -`mir_str_slice_value` will return a value of Rust type `&str` (or `&mut str`), -whereas `mir_slice_value` will return a value of Rust type `&[u8]` (or `&mut -[u8]`). These Rust types are checked when you pass these values as arguments to -Rust functions (using `mir_execute_func`) or when you return these values -(using `mir_return`), and it is an error to supply a `&str` value in a place -where a `&[u8]` value is expected (and vice versa). - -As an example of how to write specifications involving string slices, -consider this Rust function: - -~~~ .rs -pub fn my_len(s: &str) -> usize { - s.len() -} -~~~ - -We can use `mir_str_slice_value` to write a specification for `my_len` when it -is given the string `"hello"` as an argument: - -~~~ -let my_len_spec = do { - s <- mir_alloc (mir_array 5 mir_u8); - mir_points_to s (mir_term {{ "hello" }}); - - mir_execute_func [mir_str_slice_value s]; - - mir_return (mir_term {{ 5 : [32] }}); -}; -~~~ - -Currently, Cryptol only supports characters that can be encoded in a single -byte. As a result, it is not currently possible to take slices of strings with -certain characters. For example, the string `"roșu"` cannot be used as a -Cryptol expression, as the character `'ș'` would require 10 bits to represent -instead of 8. The alternative is to use UTF-8 to encode such characters. For -instance, the UTF-8 encoding of the string `"roșu"` is `"ro\200\153u"`, where -`"\200\153"` is a sequence of two bytes that represents the `'ș'` character. - -SAW makes no attempt to ensure that string slices over a particular range -aligns with UTF-8 character boundaries. For example, the following Rust code -would panic: - -~~~ .rs - let rosu: &str = "roșu"; - let s: &str = &rosu[0..3]; - println!("{:?}", s); -~~~ -~~~ -thread 'main' panicked at 'byte index 3 is not a char boundary; it is inside 'ș' (bytes 2..4) of `roșu`' -~~~ - -On the other hand, SAW will allow you define a slice of the form -`mir_str_slice_range r 0 3`, where `r` is a reference to `"ro\200\153u"`. It is -the responsibility of the SAW user to ensure that `mir_str_slice_range` indices -align with character boundaries. - -### Finding MIR algebraic data types - -We collectively refer to MIR `struct`s and `enum`s together as _algebraic data -types_, or ADTs for short. ADTs have identifiers to tell them apart, and a -single ADT declaration can give rise to multiple identifiers depending on how -the declaration is used. For example: - -~~~~ .rs -pub struct S { - pub x: A, - pub y: B, -} - -pub fn f() -> S { - S { - x: 1, - y: 2, - } -} - -pub fn g() -> S { - S { - x: 3, - y: 4, - } -} -~~~~ - -This program as a single `struct` declaration `S`, which is used in the -functions `f` and `g`. Note that `S`'s declaration is _polymorphic_, as it uses -type parameters, but the uses of `S` in `f` and `g` are _monomorphic_, as `S`'s -type parameters are fully instantiated. Each unique, monomorphic instantiation -of an ADT gives rise to its own identifier. In the example above, this might -mean that the following identifiers are created when this code is compiled with -`mir-json`: - -* `S` gives rise to `example/abcd123::S::_adt456` -* `S` gives rise to `example/abcd123::S::_adt789` - -The suffix `_adt` is autogenerated by `mir-json` and is typically -difficult for humans to guess. For this reason, we offer a command to look up -an ADT more easily: - -* `mir_find_adt : MIRModule -> String -> [MIRType] -> MIRAdt` consults the - given `MIRModule` to find an algebraic data type (`MIRAdt`). It uses the given - `String` as an identifier and the given MIRTypes as the types to instantiate - the type parameters of the ADT. If such a `MIRAdt` cannot be found in the - `MIRModule`, this will raise an error. - -Note that the `String` argument to `mir_find_adt` does not need to include the -`_adt` suffix, as `mir_find_adt` will discover this for you. The `String` -is expected to adhere to the identifier conventions described in the "Running a -MIR-based verification" section. For instance, the following two lines will -look up `S` and `S` from the example above as `MIRAdt`s: - -~~~~ -m <- mir_load_module "example.linked-mir.json"; - -let s_8_16 = mir_find_adt m "example::S" [mir_u8, mir_u16]; -let s_32_64 = mir_find_adt m "example::S" [mir_u32, mir_u64]; -~~~~ - -The `mir_adt` command (for constructing a struct type), `mir_struct_value` (for -constructing a struct value), and `mir_enum_value` (for constructing an enum -value) commands in turn take a `MIRAdt` as an argument. - -#### Enums - -In addition to taking a `MIRAdt` as an argument, `mir_enum_value` also takes a -`String` representing the name of the variant to construct. The variant name -should be a short name such as `"None"` or `"Some"`, and not a full identifier -such as `"core::option::Option::None"` or `"core::option::Option::Some"`. This -is because the `MIRAdt` already contains the full identifiers for all of an -enum's variants, so SAW will use this information to look up a variant's -identifier from a short name. Here is an example of using `mir_enum_value` in -practice: - -~~~~ .rs -pub fn n() -> Option { - None -} - -pub fn s(x: u32) -> Option { - Some(x) -} -~~~~ -~~~~ -m <- mir_load_module "example.linked-mir.json"; - -let option_u32 = mir_find_adt m "core::option::Option" [mir_u32]; - -let n_spec = do { - mir_execute_func []; - - mir_return (mir_enum_value option_u32 "None" []); -}; - -let s_spec = do { - x <- mir_fresh_var "x" mir_u32; - - mir_execute_func [mir_term x]; - - mir_return (mir_enum_value option_u32 "Some" [mir_term x]); -}; -~~~~ - -Note that `mir_enum_value` can only be used to construct a specific variant. If -you need to construct a symbolic enum value that can range over many potential -variants, use `mir_fresh_expanded_value` instead. - -#### Lifetimes - -Rust ADTs can have both type parameters as well as _lifetime_ parameters. The -following Rust code declares a lifetime parameter `'a` on the struct `S`, as -well on the function `f` that computes an `S` value: - -~~~~ .rs -pub struct S<'a> { - pub x: &'a u32, -} - -pub fn f<'a>(y: &'a u32) -> S<'a> { - S { x: y } -} -~~~~ - -When `mir-json` compiles a piece of Rust code that contains lifetime -parameters, it will instantiate all of the lifetime parameters with a -placeholder MIR type that is simply called `lifetime`. This is important to -keep in mind when looking up ADTs with `mir_find_adt`, as you will also need to -indicate to SAW that the lifetime parameter is instantiated with `lifetime`. In -order to do so, use `mir_lifetime`. For example, here is how to look up `S` -with `'a` instantiated to `lifetime`: - -~~~~ -s_adt = mir_find_adt m "example::S" [mir_lifetime] -~~~~ - -Note that this part of SAW's design is subject to change in the future. -Ideally, users would not have to care about lifetimes at all at the MIR level; -see [this issue](https://github.com/GaloisInc/mir-json/issues/58) for further -discussion on this point. If that issue is fixed, then we will likely remove -`mir_lifetime`, as it will no longer be necessary. - -### Bitfields - -SAW has experimental support for specifying `struct`s with bitfields, such as -in the following example: - -~~~~ .c -struct s { - uint8_t x:1; - uint8_t y:1; -}; -~~~~ - -Normally, a `struct` with two `uint8_t` fields would have an overall size of -two bytes. However, because the `x` and `y` fields are declared with bitfield -syntax, they are instead packed together into a single byte. - -Because bitfields have somewhat unusual memory representations in LLVM, some -special care is required to write SAW specifications involving bitfields. For -this reason, there is a dedicated `llvm_points_to_bitfield` function for this -purpose: - -* `llvm_points_to_bitfield : SetupValue -> String -> SetupValue -> LLVMSetup ()` - -The type of `llvm_points_to_bitfield` is similar that of `llvm_points_to`, -except that it takes the name of a field within a bitfield as an additional -argument. For example, here is how to assert that the `y` field in the `struct` -example above should be `0`: - -~~~~ -ss <- llvm_alloc (llvm_alias "struct.s"); -llvm_points_to_bitfield ss "y" (llvm_term {{ 0 : [1] }}); -~~~~ - -Note that the type of the right-hand side value (`0`, in this example) must -be a bitvector whose length is equal to the size of the field within the -bitfield. In this example, the `y` field was declared as `y:1`, so `y`'s value -must be of type `[1]`. - -Note that the following specification is _not_ equivalent to the one above: - -~~~~ -ss <- llvm_alloc (llvm_alias "struct.s"); -llvm_points_to (llvm_field ss "y") (llvm_term {{ 0 : [1] }}); -~~~~ - -`llvm_points_to` works quite differently from `llvm_points_to_bitfield` under -the hood, so using `llvm_points_to` on bitfields will almost certainly not work -as expected. - -In order to use `llvm_points_to_bitfield`, one must also use the -`enable_lax_loads_and_stores` command: - -* `enable_lax_loads_and_stores: TopLevel ()` - -Both `llvm_points_to_bitfield` and `enable_lax_loads_and_stores` are -experimental commands, so these also require using `enable_experimental` before -they can be used. - -The `enable_lax_loads_and_stores` command relaxes some -of SAW's assumptions about uninitialized memory, which is necessary to make -`llvm_points_to_bitfield` work under the hood. For example, reading from -uninitialized memory normally results in an error in SAW, but with -`enable_lax_loads_and_stores`, such a read will instead return a symbolic -value. At present, `enable_lax_loads_and_stores` only works with What4-based -tactics (e.g., `w4_unint_z3`); using it with SBV-based tactics -(e.g., `sbv_unint_z3`) will result in an error. - -Note that SAW relies on LLVM debug metadata in order to determine which struct -fields reside within a bitfield. As a result, you must pass `-g` to `clang` when -compiling code involving bitfields in order for SAW to be able to reason about -them. - -## Global variables - -SAW supports verifying LLVM and MIR specifications involving global variables. - -### LLVM global variables - -Mutable global variables that are accessed in a function must first be allocated -by calling `llvm_alloc_global` on the name of the global. - -* `llvm_alloc_global : String -> LLVMSetup ()` - -This ensures that all global variables that might influence the function are -accounted for explicitly in the specification: if `llvm_alloc_global` is -used in the precondition, there must be a corresponding `llvm_points_to` -in the postcondition describing the new state of that global. Otherwise, a -specification might not fully capture the behavior of the function, potentially -leading to unsoundness in the presence of compositional verification. (For more -details on this point, see the [Compositional Verification and Mutable Global -Variables](#compositional-verification-and-mutable-global-variables) section.) - -Immutable (i.e. `const`) global variables are allocated implicitly, and do not -require a call to `llvm_alloc_global`. - -Pointers to global variables or functions can be accessed with -`llvm_global`: - -* `llvm_global : String -> SetupValue` - -Like the pointers returned by `llvm_alloc`, however, these aren't -initialized at the beginning of symbolic -- setting global variables may -be unsound in the presence of [compositional -verification](#compositional-verification). - -To understand the issues surrounding global variables, consider the following C -code: - - -~~~ -int x = 0; - -int f(int y) { - x = x + 1; - return x + y; -} - -int g(int z) { - x = x + 2; - return x + z; -} -~~~ - -One might initially write the following specifications for `f` and `g`: - - -~~~ -m <- llvm_load_module "./test.bc"; - -f_spec <- llvm_verify m "f" [] true (do { - y <- llvm_fresh_var "y" (llvm_int 32); - llvm_execute_func [llvm_term y]; - llvm_return (llvm_term {{ 1 + y : [32] }}); -}) abc; - -g_spec <- llvm_llvm_verify m "g" [] true (do { - z <- llvm_fresh_var "z" (llvm_int 32); - llvm_execute_func [llvm_term z]; - llvm_return (llvm_term {{ 2 + z : [32] }}); -}) abc; -~~~ - -If globals were always initialized at the beginning of verification, -both of these specs would be provable. However, the results wouldn't -truly be compositional. For instance, it's not the case that `f(g(z)) == -z + 3` for all `z`, because both `f` and `g` modify the global variable -`x` in a way that crosses function boundaries. - -To deal with this, we can use the following function: - -* `llvm_global_initializer : String -> SetupValue` returns the value - of the constant global initializer for the named global variable. - -Given this function, the specifications for `f` and `g` can make this -reliance on the initial value of `x` explicit: - - -~~~ -m <- llvm_load_module "./test.bc"; - - -let init_global name = do { - llvm_alloc_global name; - llvm_points_to (llvm_global name) - (llvm_global_initializer name); -}; - -f_spec <- llvm_verify m "f" [] true (do { - y <- llvm_fresh_var "y" (llvm_int 32); - init_global "x"; - llvm_precond {{ y < 2^^31 - 1 }}; - llvm_execute_func [llvm_term y]; - llvm_return (llvm_term {{ 1 + y : [32] }}); -}) abc; -~~~ - -which initializes `x` to whatever it is initialized to in the C code at -the beginning of verification. This specification is now safe for -compositional verification: SAW won't use the specification `f_spec` -unless it can determine that `x` still has its initial value at the -point of a call to `f`. This specification also constrains `y` to prevent -signed integer overflow resulting from the `x + y` expression in `f`, -which is undefined behavior in C. - -### MIR static items - -Rust's static items are the MIR version of global variables. A reference to a -static item can be accessed with the `mir_static` function. This function takes -a `String` representing a static item's identifier, and this identifier is -expected to adhere to the naming conventions outlined in the "Running a -MIR-based verification" section: - -* `mir_static : String -> MIRValue` - -References to static values can be initialized with the `mir_points_to` -command, just like with other forms of references. Immutable static items -(e.g., `static X: u8 = 42`) are initialized implicitly in every SAW -specification, so there is no need for users to do so manually. Mutable static -items (e.g., `static mut Y: u8 = 27`), on the other hand, are *not* initialized -implicitly, and users must explicitly pick a value to initialize them with. - -The `mir_static_initializer` function can be used to access the initial value -of a static item in a MIR program. Like with `mir_static`, the `String` -supplied as an argument must be a valid identifier: - -* `mir_static_initializer : String -> MIRValue`. - -As an example of how to use these functions, here is a Rust program involving -static items: - -~~~ .rs -// statics.rs -static S1: u8 = 1; -static mut S2: u8 = 2; - -pub fn f() -> u8 { - // Reading a mutable static item requires an `unsafe` block due to - // concurrency-related concerns. We are only concerned about the behavior - // of this program in a single-threaded context, so this is fine. - let s2 = unsafe { S2 }; - S1 + s2 -} -~~~ - -We can write a specification for `f` like so: - -~~~ -// statics.saw -enable_experimental; - -let f_spec = do { - mir_points_to (mir_static "statics::S2") - (mir_static_initializer "statics::S2"); - // Note that we do not initialize S1, as immutable static items are implicitly - // initialized in every specification. - - mir_execute_func []; - - mir_return (mir_term {{ 3 : [8] }}); -}; - -m <- mir_load_module "statics.linked-mir.json"; - -mir_verify m "statics::f" [] false f_spec z3; -~~~ - -In order to use a specification involving mutable static items for -compositional verification, it is required to specify the value of all mutable -static items using the `mir_points_to` command in the specification's -postconditions. For more details on this point, see the [Compositional -Verification and Mutable Global -Variables](#compositional-verification-and-mutable-global-variables) section. - -## Preconditions and Postconditions - -Sometimes a function is only well-defined under certain conditions, or -sometimes you may be interested in certain initial conditions that give -rise to specific final conditions. For these cases, you can specify an -arbitrary predicate as a precondition or post-condition, using any -values in scope at the time. - -* `llvm_precond : Term -> LLVMSetup ()` -* `llvm_postcond : Term -> LLVMSetup ()` -* `llvm_assert : Term -> LLVMSetup ()` -* `jvm_precond : Term -> JVMSetup ()` -* `jvm_postcond : Term -> JVMSetup ()` -* `jvm_assert : Term -> JVMSetup ()` -* `mir_precond : Term -> MIRSetup ()` -* `mir_postcond : Term -> MIRSetup ()` -* `mir_assert : Term -> MIRSetup ()` - -These commands take `Term` arguments, and therefore cannot describe the values -of pointers. The "assert" variants will work in either pre- or post-conditions, -and are useful when defining helper functions that, e.g., provide datastructure -invariants that make sense in both phases. The `{llvm,jvm,mir}_equal` commands -state that two values should be equal, and can be used in either the initial or -the final state. - -* `llvm_equal : SetupValue -> SetupValue -> LLVMSetup ()` -* `jvm_equal : JVMValue -> JVMValue -> JVMSetup ()` -* `mir_equal : MIRValue -> MIRValue -> MIRSetup ()` - -The use of `{llvm,jvm,mir}_equal` can also sometimes lead to more efficient -symbolic execution when the predicate of interest is an equality. - -## Assuming specifications - -Normally, a `MethodSpec` is the result of both simulation and proof of -the target code. However, in some cases, it can be useful to use a -`MethodSpec` to specify some code that either doesn't exist or is hard -to prove. The previously-mentioned [`assume_unsat` -tactic](#miscellaneous-tactics) omits proof but does not prevent -simulation of the function. To skip simulation altogether, one can use -one of the following commands: - -~~~ -llvm_unsafe_assume_spec : - LLVMModule -> String -> LLVMSetup () -> TopLevel CrucibleMethodSpec - -jvm_unsafe_assume_spec : - JavaClass -> String -> JVMSetup () -> TopLevel JVMMethodSpec - -mir_unsafe_assume_spec : - MIRModule -> String -> MIRSetup () -> TopLevel MIRSpec -~~~ - -## A Heap-Based Example - -To tie all of the command descriptions from the previous sections -together, consider the case of verifying the correctness of a C program -that computes the dot product of two vectors, where the length and value -of each vector are encapsulated together in a `struct`. - -The dot product can be concisely specified in Cryptol as follows: - -~~~~ -dotprod : {n, a} (fin n, fin a) => [n][a] -> [n][a] -> [a] -dotprod xs ys = sum (zip (*) xs ys) -~~~~ - -To implement this in C, let's first consider the type of vectors: - -~~~~ .c -typedef struct { - uint32_t *elts; - uint32_t size; -} vec_t; -~~~~ - -This struct contains a pointer to an array of 32-bit elements, and a -32-bit value indicating how many elements that array has. - -We can compute the dot product of two of these vectors with the -following C code (which uses the size of the shorter vector if they -differ in size). - -~~~~ .c -uint32_t dotprod_struct(vec_t *x, vec_t *y) { - uint32_t size = MIN(x->size, y->size); - uint32_t res = 0; - for(size_t i = 0; i < size; i++) { - res += x->elts[i] * y->elts[i]; - } - return res; -} -~~~~ - -The entirety of this implementation can be found in the -`examples/llvm/dotprod_struct.c` file in the `saw-script` repository. - -To verify this program in SAW, it will be convenient to define a couple -of utility functions (which are generally useful for many -heap-manipulating programs). First, combining allocation and -initialization to a specific value can make many scripts more concise: - -~~~~ -let alloc_init ty v = do { - p <- llvm_alloc ty; - llvm_points_to p v; - return p; -}; -~~~~ - -This creates a pointer `p` pointing to enough space to store type `ty`, -and then indicates that the pointer points to value `v` (which should be -of that same type). - -A common case for allocation and initialization together is when the -initial value should be entirely symbolic. - -~~~~ -let ptr_to_fresh n ty = do { - x <- llvm_fresh_var n ty; - p <- alloc_init ty (llvm_term x); - return (x, p); -}; -~~~~ - -This function returns the pointer just allocated along with the fresh -symbolic value it points to. - -Given these two utility functions, the `dotprod_struct` function can be -specified as follows: - -~~~~ -let dotprod_spec n = do { - let nt = llvm_term {{ `n : [32] }}; - (xs, xsp) <- ptr_to_fresh "xs" (llvm_array n (llvm_int 32)); - (ys, ysp) <- ptr_to_fresh "ys" (llvm_array n (llvm_int 32)); - let xval = llvm_struct_value [ xsp, nt ]; - let yval = llvm_struct_value [ ysp, nt ]; - xp <- alloc_init (llvm_alias "struct.vec_t") xval; - yp <- alloc_init (llvm_alias "struct.vec_t") yval; - llvm_execute_func [xp, yp]; - llvm_return (llvm_term {{ dotprod xs ys }}); -}; -~~~~ - -Any instantiation of this specification is for a specific vector length -`n`, and assumes that both input vectors have that length. That length -`n` automatically becomes a type variable in the subsequent Cryptol -expressions, and the backtick operator is used to reify that type as a -bit vector of length 32. - -The entire script can be found in the `dotprod_struct-crucible.saw` file -alongside `dotprod_struct.c`. - -Running this script results in the following: - -~~~~ -Loading file "dotprod_struct.saw" -Proof succeeded! dotprod_struct -Registering override for `dotprod_struct` - variant `dotprod_struct` -Symbolic simulation completed with side conditions. -Proof succeeded! dotprod_wrap -~~~~ - -## Using Ghost State - -In some cases, information relevant to verification is not directly -present in the concrete state of the program being verified. This can -happen for at least two reasons: - -* When providing specifications for external functions, for which source - code is not present. The external code may read and write global state - that is not directly accessible from the code being verified. - -* When the abstract specification of the program naturally uses a - different representation for some data than the concrete - implementation in the code being verified does. - -One solution to these problems is the use of _ghost_ state. This can be -thought of as additional global state that is visible only to the -verifier. Ghost state with a given name can be declared at the top level -with the following function: - -* `declare_ghost_state : String -> TopLevel Ghost` - -Ghost state variables do not initially have any particluar type, and can -store data of any type. Given an existing ghost variable the following -functions can be used to specify its value: - -* `llvm_ghost_value : Ghost -> Term -> LLVMSetup ()` -* `jvm_ghost_value : Ghost -> Term -> JVMSetup ()` -* `mir_ghost_value : Ghost -> Term -> MIRSetup ()` - -These can be used in either the pre state or the post state, to specify the -value of ghost state either before or after the execution of the function, -respectively. - -## An Extended Example - -To tie together many of the concepts in this manual, we now present a -non-trivial verification task in its entirety. All of the code for this example -can be found in the `examples/salsa20` directory of -[the SAWScript repository](https://github.com/GaloisInc/saw-script). - -### Salsa20 Overview - -Salsa20 is a stream cipher developed in 2005 by Daniel J. Bernstein, built on a -pseudorandom function utilizing add-rotate-XOR (ARX) operations on 32-bit -words[^4]. Bernstein himself has provided several public domain -implementations of the cipher, optimized for common machine architectures. -For the mathematically inclined, his specification for the cipher can be -found [here](http://cr.yp.to/snuffle/spec.pdf). - -The repository referenced above contains three implementations of the Salsa20 -cipher: A reference Cryptol implementation (which we take as correct in this -example), and two C implementations, one of which is from Bernstein himself. -For this example, we focus on the second of these C -implementations, which more closely matches the Cryptol implementation. Full -verification of Bernstein's implementation is available in -`examples/salsa20/djb`, for the interested. The code for this verification task -can be found in the files named according to the pattern -`examples/salsa20/(s|S)alsa20.*`. - -### Specifications - -We now take on the actual verification task. This will be done in two stages: -We first define some useful utility functions for constructing common patterns -in the specifications for this type of program (i.e. one where the arguments to -functions are modified in-place.) We then demonstrate how one might construct a -specification for each of the functions in the Salsa20 implementation described -above. - -#### Utility Functions - -We first define the function -`alloc_init : LLVMType -> Term -> LLVMSetup SetupValue`. - -`alloc_init ty v` returns a `SetupValue` consisting of a pointer to memory -allocated and initialized to a value `v` of type `ty`. `alloc_init_readonly` -does the same, except the memory allocated cannot be written to. - -~~~~ -import "Salsa20.cry"; - -let alloc_init ty v = do { - p <- llvm_alloc ty; - llvm_points_to p (llvm_term v); - return p; -}; - -let alloc_init_readonly ty v = do { - p <- llvm_alloc_readonly ty; - llvm_points_to p (llvm_term v); - return p; -}; -~~~~ - -We now define -`ptr_to_fresh : String -> LLVMType -> LLVMSetup (Term, SetupValue)`. - -`ptr_to_fresh n ty` returns a pair `(x, p)` consisting of a fresh symbolic -variable `x` of type `ty` and a pointer `p` to it. `n` specifies the -name that SAW should use when printing `x`. `ptr_to_fresh_readonly` does the -same, but returns a pointer to space that cannot be written to. - -~~~~ -let ptr_to_fresh n ty = do { - x <- llvm_fresh_var n ty; - p <- alloc_init ty x; - return (x, p); -}; - -let ptr_to_fresh_readonly n ty = do { - x <- llvm_fresh_var n ty; - p <- alloc_init_readonly ty x; - return (x, p); -}; -~~~~ - -Finally, we define -`oneptr_update_func : String -> LLVMType -> Term -> LLVMSetup ()`. - -`oneptr_update_func n ty f` specifies the behavior of a function that takes -a single pointer (with a printable name given by `n`) to memory containing a -value of type `ty` and mutates the contents of that memory. The specification -asserts that the contents of this memory after execution are equal to the value -given by the application of `f` to the value in that memory before execution. - -~~~~ -let oneptr_update_func n ty f = do { - (x, p) <- ptr_to_fresh n ty; - llvm_execute_func [p]; - llvm_points_to p (llvm_term {{ f x }}); -}; -~~~~ - -#### The `quarterround` operation - -The C function we wish to verify has type -`void s20_quarterround(uint32_t *y0, uint32_t *y1, uint32_t *y2, uint32_t *y3)`. - -The function's specification generates four symbolic variables and pointers to -them in the precondition/setup stage. The pointers are passed to the function -during symbolic execution via `llvm_execute_func`. Finally, in the -postcondition/return stage, the expected values are computed using the trusted -Cryptol implementation and it is asserted that the pointers do in fact point to -these expected values. - -~~~~ -let quarterround_setup : LLVMSetup () = do { - (y0, p0) <- ptr_to_fresh "y0" (llvm_int 32); - (y1, p1) <- ptr_to_fresh "y1" (llvm_int 32); - (y2, p2) <- ptr_to_fresh "y2" (llvm_int 32); - (y3, p3) <- ptr_to_fresh "y3" (llvm_int 32); - - llvm_execute_func [p0, p1, p2, p3]; - - let zs = {{ quarterround [y0,y1,y2,y3] }}; // from Salsa20.cry - llvm_points_to p0 (llvm_term {{ zs@0 }}); - llvm_points_to p1 (llvm_term {{ zs@1 }}); - llvm_points_to p2 (llvm_term {{ zs@2 }}); - llvm_points_to p3 (llvm_term {{ zs@3 }}); -}; -~~~~ - -#### Simple Updating Functions - -The following functions can all have their specifications given by the utility -function `oneptr_update_func` implemented above, so there isn't much to say -about them. - -~~~~ -let rowround_setup = - oneptr_update_func "y" (llvm_array 16 (llvm_int 32)) {{ rowround }}; - -let columnround_setup = - oneptr_update_func "x" (llvm_array 16 (llvm_int 32)) {{ columnround }}; - -let doubleround_setup = - oneptr_update_func "x" (llvm_array 16 (llvm_int 32)) {{ doubleround }}; - -let salsa20_setup = - oneptr_update_func "seq" (llvm_array 64 (llvm_int 8)) {{ Salsa20 }}; -~~~~ - -#### 32-Bit Key Expansion - -The next function of substantial behavior that we wish to verify has the -following prototype: - -~~~~c -void s20_expand32( uint8_t *k - , uint8_t n[static 16] - , uint8_t keystream[static 64] - ) -~~~~ - -This function's specification follows a similar pattern to that of -`s20_quarterround`, though for extra assurance we can make sure that the -function does not write to the memory pointed to by `k` or `n` using the -utility `ptr_to_fresh_readonly`, as this function should only modify -`keystream`. Besides this, we see the call to the trusted Cryptol -implementation specialized to `a=2`, which does 32-bit key expansion (since the -Cryptol implementation can also specialize to `a=1` for 16-bit keys). This -specification can easily be changed to work with 16-bit keys. - -~~~~ -let salsa20_expansion_32 = do { - (k, pk) <- ptr_to_fresh_readonly "k" (llvm_array 32 (llvm_int 8)); - (n, pn) <- ptr_to_fresh_readonly "n" (llvm_array 16 (llvm_int 8)); - - pks <- llvm_alloc (llvm_array 64 (llvm_int 8)); - - llvm_execute_func [pk, pn, pks]; - - let rks = {{ Salsa20_expansion`{a=2}(k, n) }}; - llvm_points_to pks (llvm_term rks); -}; -~~~~ - -#### 32-bit Key Encryption - -Finally, we write a specification for the encryption function itself, which has -type - -~~~~c -enum s20_status_t s20_crypt32( uint8_t *key - , uint8_t nonce[static 8] - , uint32_t si - , uint8_t *buf - , uint32_t buflen - ) -~~~~ - -As before, we can ensure this function does not modify the memory pointed to by -`key` or `nonce`. We take `si`, the stream index, to be 0. The specification is -parameterized on a number `n`, which corresponds to `buflen`. Finally, to deal -with the fact that this function returns a status code, we simply specify that -we expect a success (status code 0) as the return value in the postcondition -stage of the specification. - -~~~~ -let s20_encrypt32 n = do { - (key, pkey) <- ptr_to_fresh_readonly "key" (llvm_array 32 (llvm_int 8)); - (v, pv) <- ptr_to_fresh_readonly "nonce" (llvm_array 8 (llvm_int 8)); - (m, pm) <- ptr_to_fresh "buf" (llvm_array n (llvm_int 8)); - - llvm_execute_func [ pkey - , pv - , llvm_term {{ 0 : [32] }} - , pm - , llvm_term {{ `n : [32] }} - ]; - - llvm_points_to pm (llvm_term {{ Salsa20_encrypt (key, v, m) }}); - llvm_return (llvm_term {{ 0 : [32] }}); -}; -~~~~ - -### Verifying Everything - -Finally, we can verify all of the functions. Notice the use of compositional -verification and that path satisfiability checking is enabled for those -functions with loops not bounded by explicit constants. Notice that we prove -the top-level function for several sizes; this is due to the limitation that -SAW can only operate on finite programs (while Salsa20 can operate on any input -size.) - -~~~~ -let main : TopLevel () = do { - m <- llvm_load_module "salsa20.bc"; - qr <- llvm_verify m "s20_quarterround" [] false quarterround_setup abc; - rr <- llvm_verify m "s20_rowround" [qr] false rowround_setup abc; - cr <- llvm_verify m "s20_columnround" [qr] false columnround_setup abc; - dr <- llvm_verify m "s20_doubleround" [cr,rr] false doubleround_setup abc; - s20 <- llvm_verify m "s20_hash" [dr] false salsa20_setup abc; - s20e32 <- llvm_verify m "s20_expand32" [s20] true salsa20_expansion_32 abc; - s20encrypt_63 <- llvm_verify m "s20_crypt32" [s20e32] true (s20_encrypt32 63) abc; - s20encrypt_64 <- llvm_verify m "s20_crypt32" [s20e32] true (s20_encrypt32 64) abc; - s20encrypt_65 <- llvm_verify m "s20_crypt32" [s20e32] true (s20_encrypt32 65) abc; - - print "Done!"; -}; -~~~~ - -[^4]: https://en.wikipedia.org/wiki/Salsa20 - -## Verifying Cryptol FFI functions - -SAW has special support for verifying the correctness of Cryptol's -[`foreign` -functions](https://galoisinc.github.io/cryptol/master/FFI.html), -implemented in a language such as C which compiles to LLVM, provided -that there exists a [reference Cryptol -implementation](https://galoisinc.github.io/cryptol/master/FFI.html#cryptol-implementation-of-foreign-functions) -of the function as well. Since the way in which `foreign` functions are -called is precisely specified by the Cryptol FFI, SAW is able to -generate a `LLVMSetup ()` spec directly from the type of a Cryptol -`foreign` function. This is done with the `llvm_ffi_setup` command, -which is experimental and requires `enable_experimental;` to be run -beforehand. -``` -llvm_ffi_setup : Term -> LLVMSetup () -``` - -For instance, for the simple imported Cryptol foreign function `foreign -add : [32] -> [32] -> [32]` we can obtain a `LLVMSetup` spec simply by -writing -``` -let add_setup = llvm_ffi_setup {{ add }}; -``` -which behind the scenes expands to something like -``` -let add_setup = do { - in0 <- llvm_fresh_var "in0" (llvm_int 32); - in1 <- llvm_fresh_var "in1" (llvm_int 32); - llvm_execute_func [llvm_term in0, llvm_term in1]; - llvm_return (llvm_term {{ add in0 in1 }}); -}; -``` - -### Polymorphism - -In general, Cryptol `foreign` functions can be polymorphic, with type -parameters of kind `#`, representing e.g. the sizes of input sequences. -However, any individual `LLVMSetup ()` spec only specifies the behavior -of the LLVM function on inputs of concrete sizes. We handle this by -allowing the argument term of `llvm_ffi_setup` to contain any necessary -type arguments in addition to the Cryptol function name, so that the -resulting term is monomorphic. The user can then define a parameterized -specification simply as a SAWScript function in the usual way. For -example, for a function `foreign f : {n, m} (fin n, fin m) => [n][32] -> -[m][32]`, we can obtain a parameterized `LLVMSetup` spec by -``` -let f_setup (n : Int) (m : Int) = llvm_ffi_setup {{ f`{n, m} }}; -``` -Note that the `Term` parameter that `llvm_ffi_setup` takes is restricted -syntactically to the format described above (``{{ fun`{tyArg0, tyArg1, -..., tyArgN} }}``), and cannot be any arbitrary term. - -### Supported types - -`llvm_ffi_setup` supports all Cryptol types that are supported by the -Cryptol FFI, with the exception of `Integer`, `Rational`, `Z`, and -`Float`. `Integer`, `Rational`, and `Z` are not supported since they are -translated to `gmp` arbitrary-precision types which are hard for SAW to -handle without additional overrides. There is no fundamental obstacle to -supporting `Float`, and in fact `llvm_ffi_setup` itself does work with -Cryptol floating point types, but the underlying functions such as -`llvm_fresh_var` do not, so until that is implemented `llvm_ffi_setup` -can generate a spec involving floating point types but it cannot -actually be run. - -### Performing the verification - -The resulting `LLVMSetup ()` spec can be used with the existing -`llvm_verify` function to perform the actual verification. And the -`LLVMSpec` output from that can be used as an override as usual for -further compositional verification. -``` -f_ov <- llvm_verify mod "f" [] true (f_setup 3 5) z3; -``` - -As with the Cryptol FFI itself, SAW does not manage the compilation of -the C source implementations of `foreign` functions to LLVM bitcode. For -the verification to be meaningful, is expected that the LLVM module -passed to `llvm_verify` matches the compiled dynamic library actually -used with the Cryptol interpreter. Alternatively, on x86_64 Linux, SAW -can perform verification directly on the `.so` ELF file with the -experimental `llvm_verify_x86` command. - -# Extraction to the Coq theorem prover - -In addition to the (semi-)automatic and compositional proof modes -already discussed above, SAW has experimental support for exporting -Cryptol and SAWCore values as terms to the Coq proof assistant[^5]. -This is intended to support more manual proof efforts for properties -that go beyond what SAW can support (for example, proofs requiring -induction) or for connecting to preexisting formalizations in Coq -of useful algorithms (e.g. the fiat crypto library[^6]). - -This support consists of two related pieces. The first piece is a library of -formalizations of the primitives underlying Cryptol and SAWCore and -various supporting concepts that help bridge the conceptual gap -between SAW and Coq. The second piece is a term translation -that maps the syntactic forms of SAWCore onto corresponding -concepts in Coq syntax, designed to dovetail with the concepts -defined in the support library. SAWCore is a quite similar language -to the core calculus underlying Coq, so much of this translation is -quite straightforward; however, the languages are not exactly -equivalent, and there are some tricky cases that mostly arise from -Cryptol code that can only be partially supported. We will note -these restrictions later in the manual. - -We expect this extraction process to work with a fairly wide range of Coq -versions, as we are not using bleeding-edge Coq features. It has been -most fully tested with Coq version 8.13.2. - -## Support Library - -In order to make use of SAW's extraction capabilities, one must first -compile the support library using Coq so that the included definitions -and theorems can be referenced by the extracted code. From the top -directory of the SAW source tree, the source code for this support -library can be found in the `saw-core-coq/coq` subdirectory. -In this subdirectory you will find a `_CoqProject` and a `Makefile`. -A simple `make` invocation should be enough to compile -all the necessary files, assuming Coq is installed and `coqc` is -available in the user's `PATH`. HTML documentation for the support -library can also be generated by `make html` from the same directory. - -Once the library is compiled, the recommended way to import -it into your subsequent development is by adding the following -lines to your `_CoqProject` file: - -``` --Q /saw-core-coq/coq/generated/CryptolToCoq CryptolToCoq --Q /saw-core-coq/coq/handwritten/CryptolToCoq CryptolToCoq -``` - -Here `` refers to the location on your system where the -SAWScript source tree is checked out. This will add the relevant -library files to the `CryptolToCoq` namespace, where the extraction -process will expect to find them. - -The support library for extraction is broken into two parts: those -files which are handwritten, versus those that are automatically -generated. The handwritten files are generally fairly readable and -are reasonable for human inspection; they define most of the -interesting pipe-fitting that allows Cryptol and SAWCore definitions -to connect to corresponding Coq concepts. In particular the -file `SAWCoreScaffolding.v` file defines most of the bindings of base -types to Coq types, and the `SAWCoreVectorsAsCoqVectors.v` defines the -core bitvector operations. The automatically generated files are direct -translations of the SAWCore source files -(`saw-core/prelude/Prelude.sawcore` and -`cryptol-saw-core/saw/Cryptol.sawcore`) that correspond to the -standard libraries for SAWCore and Cryptol, respectively. - -The autogenerated files are intended to be kept up-to-date with -changes in the corresponding `sawcore` files, and end users should -not need to generate them. Nonetheless, if they are out of sync for some -reason, these files may be regenerated using the `saw` executable -by running `(cd saw-core-coq; saw saw/generate_scaffolding.saw)` -from the top-level of the SAW source directory before compiling them -with Coq as described above. - -You may also note some additional files and concepts in the standard -library, such as `CompM.v`, and a variety of lemmas and definitions -related to it. These definitions are related to the "heapster" system, -which form a separate use-case for the SAWCore to Coq translation. -These definitions will not be used for code extracted from Cryptol. - -## Cryptol module extraction - -There are several modes of use for the SAW to Coq term extraction -facility, but the easiest to use is whole Cryptol module extraction. -This will extract all the definitions in the given Cryptol module, -together with it's transitive dependencies, into a single Coq module -which can then be compiled and pulled into subsequence developments. - -Suppose we have a Cryptol source file named `source.cry` and we want -to generate a Coq file named `output.v`. We can accomplish this by -running the following commands in saw (either directly from the `saw` -command prompt, or via a script file) - -``` -enable_experimental; -write_coq_cryptol_module "source.cry" "output.v" [] []; -``` - -In this default mode, identifiers in the Cryptol source will be -directly translated into identifiers in Coq. This may occasionally -cause problems if source identifiers clash with Coq keywords or -preexisting definitions. The third argument to -`write_coq_cryptol_module` can be used to remap such names if -necessary by giving a list of `(in,out)` pairs of names. The fourth -argument is a list of source identifiers to skip translating, if -desired. Authoritative online documentation for this command can be -obtained directly from the `saw` executable via `:help -write_coq_cryptol_module` after `enable_experimental`. - -The resulting "output.v" file will have some of the usual hallmarks of -computer-generated code; it will have poor formatting and, explicit -parenthesis and fully-qualified names. Thankfully, once loaded into -Coq, the Coq pretty-printer will do a much better job of rendering -these terms in a somewhat human-readable way. - -## Proofs involving uninterpreted functions - -It is possible to write a Cryptol module that references uninterpreted -functions by using the `primitive` keyword to declare them in your -Cryptol source. Primitive Cryptol declarations will be translated into -Coq section variables; as usual in Coq, uses of these section -variables will be translated into additional parameters to the -definitions from outside the section. In this way, consumers of -the translated module can instantiate the declared Cryptol functions -with corresponding terms in subsequent Coq developments. - -Although the Cryptol interpreter itself will not be able to compute -with declared but undefined functions of this sort, they can be used -both to provide specifications for functions to be verified with -`llvm_verify` or `jvm_verify` and also for Coq extraction. - -For example, if I write the following Cryptol source file: - -``` -primitive f : Integer -> Integer - -g : Integer -> Bool -g x = f (f x) > 0 -``` - -After extraction, the generated term `g` will have Coq type: - -``` -(Integer -> Integer) -> Integer -> Bool -``` - -## Translation limitations and caveats - -Translation from Cryptol to Coq has a number of fundamental -limitations that must be addressed. The most severe of these is that -Cryptol is a fully general-recursive language, and may exhibit runtime -errors directly via calls to the `error` primitive, or via partial -operations (such as indexing a sequence out-of-bounds). The internal -language of Coq, by contrast, is a strongly-normalizing language of -total functions. As such, our translation is unable to extract all -Cryptol programs. - -### Recursive programs - -The most severe of the currently limitations for our system -is that the translation is unable to translate any recursive Cryptol -program. Doing this would require us to attempt to find some -termination argument for the recursive function sufficient to satisfy -Coq; for now, no attempt is made to do so. if you attempt to -extract a recursive function, SAW will produce an error -about a "malformed term" with `Prelude.fix` as the head symbol. - -Certain limited kinds of recursion are available via the -`foldl` Cryptol primitive operation, which is translated directly -into a fold operation in Coq. This is sufficient for many basic -iterative algorithms. - -### Type coercions - -Another limitation of the translation system is that Cryptol uses SMT -solvers during its typechecking process and uses the results of solver -proofs to justify some of its typing judgments. When extracting these -terms to Coq, type equality coercions must be generated. Currently, we -do not have a good way to transport the reasoning done inside -Cryptol's typechecker into Coq, so we just supply a very simple `Ltac` -tactic to discharge these coercions (see `solveUnsafeAssert` in -`CryptolPrimitivesForSAWCoreExtra.v`). This tactic is able to -discover simple coercions, but for anything nontrivial it may -fail. The result will be a typechecking failure when compiling the -generated code in Coq when the tactic fails. If you encounter this -problem, it may be possible to enhance the `solveUnsafeAssert` tactic -to cover your use case. - -### Error terms - -A final caveat that is worth mentioning is that Cryptol can sometimes -produce runtime errors. These can arise from explicit calls to the -`error` primitive, or from partially defined operations (e.g., -division by zero or sequence access out of bounds). Such instances -are translated to occurrences of an unrealized Coq axiom named `error`. -In order to avoid introducing an inconsistent environment, the `error` -axiom is restricted to apply only to inhabited types. All the types -arising from Cryptol programs are inhabited, so this is no problem -in principle. However, collecting and passing this information around -on the Coq side is a little tricky. - -The main technical device we use here is the `Inhabited` type class; -it simply asserts that a type has some distinguished inhabitant. We -provide instances for the base types and type constructors arising -from Cryptol, so the necessary instances ought to be automatically -constructed when needed. However, polymorphic Cryptol functions add -some complications, as type arguments must also come together with -evidence that they are inhabited. The translation process takes care -to add the necessary `Inhabited` arguments, so everything ought to -work out. However, if Coq typechecking of generated code fails with -errors about `Inhabited` class instances, it likely represents some -problem with this aspect of the translation. - - -[^5]: https://coq.inria.fr -[^6]: https://github.com/mit-plv/fiat-crypto - -# Analyzing Hardware Circuits using Yosys -SAW has experimental support for analysis of hardware descriptions written in VHDL ([via GHDL](https://github.com/ghdl/ghdl-yosys-plugin)) through an intermediate representation produced by [Yosys](https://yosyshq.net/yosys/). -This generally follows the same conventions and idioms used in the rest of SAWScript. - -## Processing VHDL With Yosys -Given a VHDL file `test.vhd` containing an entity `test`, one can generate an intermediate representation `test.json` suitable for loading into SAW: - -~~~~ -$ ghdl -a test.vhd -$ yosys -... -Yosys 0.10+1 (git sha1 7a7df9a3b4, gcc 10.3.0 -fPIC -Os) -yosys> ghdl test - -1. Executing GHDL. -Importing module test. - -yosys> write_json test.json - -2. Executing JSON backend. -~~~~ - -It can sometimes be helpful to invoke additional Yosys passes between the `ghdl` and `write_json` commands. -For example, at present SAW does not support the `$pmux` cell type. -Yosys is able to convert `$pmux` cells into trees of `$mux` cells using the `pmuxtree` command. -We expect there are many other situations where Yosys' considerable library of commands is valuable for pre-processing. - -## Example: Ripple-Carry Adder -Consider three VHDL entities. -First, a half-adder: - -~~~~vhdl -library ieee; -use ieee.std_logic_1164.all; - -entity half is - port ( - a : in std_logic; - b : in std_logic; - c : out std_logic; - s : out std_logic - ); -end half; - -architecture halfarch of half is -begin - c <= a and b; - s <= a xor b; -end halfarch; -~~~~ - -Next, a one-bit adder built atop that half-adder: - -~~~~vhdl -library ieee; -use ieee.std_logic_1164.all; - -entity full is - port ( - a : in std_logic; - b : in std_logic; - cin : in std_logic; - cout : out std_logic; - s : out std_logic - ); -end full; - -architecture fullarch of full is - signal half0c : std_logic; - signal half0s : std_logic; - signal half1c : std_logic; -begin - half0 : entity work.half port map (a => a, b => b, c => half0c, s => half0s); - half1 : entity work.half port map (a => half0s, b => cin, c => half1c, s => s); - cout <= half0c or half1c; -end fullarch; -~~~~ - -Finally, a four-bit adder: - -~~~~vhdl -library ieee; -use ieee.std_logic_1164.all; - -entity add4 is - port ( - a : in std_logic_vector(0 to 3); - b : in std_logic_vector(0 to 3); - res : out std_logic_vector(0 to 3) - ); -end add4; - -architecture add4arch of add4 is - signal full0cout : std_logic; - signal full1cout : std_logic; - signal full2cout : std_logic; - signal ignore : std_logic; -begin - full0 : entity work.full port map (a => a(0), b => b(0), cin => '0', cout => full0cout, s => res(0)); - full1 : entity work.full port map (a => a(1), b => b(1), cin => full0cout, cout => full1cout, s => res(1)); - full2 : entity work.full port map (a => a(2), b => b(2), cin => full1cout, cout => full2cout, s => res(2)); - full3 : entity work.full port map (a => a(3), b => b(3), cin => full2cout, cout => ignore, s => res(3)); -end add4arch; -~~~~ - -Using GHDL and Yosys, we can convert the VHDL source above into a format that SAW can import. -If all of the code above is in a file `adder.vhd`, we can run the following commands: - -~~~~ -$ ghdl -a adder.vhd -$ yosys -p 'ghdl add4; write_json adder.json' -~~~~ - -The produced file `adder.json` can then be loaded into SAW with `yosys_import`: - -~~~~ -$ saw -... -sawscript> enable_experimental -sawscript> m <- yosys_import "adder.json" -sawscript> :type m -Term -sawscript> type m -[23:57:14.492] {add4 : {a : [4], b : [4]} -> {res : [4]}, - full : {a : [1], b : [1], cin : [1]} -> {cout : [1], s : [1]}, - half : {a : [1], b : [1]} -> {c : [1], s : [1]}} -~~~~ - -`yosys_import` returns a `Term` with a Cryptol record type, where the fields correspond to each VHDL module. -We can access the fields of this record like we would any Cryptol record, and call the functions within like any Cryptol function. - -~~~~ -sawscript> type {{ m.add4 }} -[00:00:25.255] {a : [4], b : [4]} -> {res : [4]} -sawscript> eval_int {{ (m.add4 { a = 1, b = 2 }).res }} -[00:02:07.329] 3 -~~~~ - -We can also use all of SAW's infrastructure for asking solvers about `Term`s, such as the `sat` and `prove` commands. -For example: - -~~~~ -sawscript> sat w4 {{ m.add4 === \_ -> { res = 5 } }} -[00:04:41.993] Sat: [_ = (5, 0)] -sawscript> prove z3 {{ m.add4 === \inp -> { res = inp.a + inp.b } }} -[00:05:43.659] Valid -sawscript> prove yices {{ m.add4 === \inp -> { res = inp.a - inp.b } }} -[00:05:56.171] Invalid: [_ = (8, 13)] -~~~~ - -The full library of `ProofScript` tactics is available in this setting. -If necessary, proof tactics like `simplify` can be used to rewrite goals before querying a solver. - -Special support is provided for the common case of equivalence proofs between HDL modules and other `Term`s (e.g. Cryptol functions, other HDL modules, or "extracted" imperative LLVM or JVM code). -The command `yosys_verify` has an interface similar to `llvm_verify`: given a specification, some lemmas, and a proof tactic, it produces evidence of a proven equivalence that may be passed as a lemma to future calls of `yosys_verify`. -For example, consider the following Cryptol specifications for one-bit and four-bit adders: - -~~~~cryptol -cryfull : {a : [1], b : [1], cin : [1]} -> {cout : [1], s : [1]} -cryfull inp = { cout = [cout], s = [s] } - where [cout, s] = zext inp.a + zext inp.b + zext inp.cin - -cryadd4 : {a : [4], b : [4]} -> {res : [4]} -cryadd4 inp = { res = inp.a + inp.b } -~~~~ - -We can prove equivalence between `cryfull` and the VHDL `full` module: - -~~~~ -sawscript> full_spec <- yosys_verify {{ m.full }} [] {{ cryfull }} [] w4; -~~~~ - -The result `full_spec` can then be used as an "override" when proving equivalence between `cryadd4` and the VHDL `add4` module: - -~~~~ -sawscript> add4_spec <- yosys_verify {{ m.add4 }} [] {{ cryadd4 }} [full_spec] w4; -~~~~ - -The above could also be accomplished through the use of `prove_print` and term rewriting, but it is much more verbose. - -`yosys_verify` may also be given a list of preconditions under which the equivalence holds. -For example, consider the following Cryptol specification for `full` that ignores the `cin` bit: - -~~~~cryptol -cryfullnocarry : {a : [1], b : [1], cin : [1]} -> {cout : [1], s : [1]} -cryfullnocarry inp = { cout = [cout], s = [s] } - where [cout, s] = zext inp.a + zext inp.b -~~~~ - -This is not equivalent to `full` in general, but it is if constrained to inputs where `cin = 0`. -We may express that precondition like so: - -~~~~ -sawscript> full_nocarry_spec <- yosys_verify {{ adderm.full }} [{{\(inp : {a : [1], b : [1], cin : [1]}) -> inp.cin == 0}}] {{ cryfullnocarry }} [] w4; -~~~~ - -The resulting override `full_nocarry_spec` may still be used in the proof for `add4` (this is accomplished by rewriting to a conditional expression). - -## API Reference -N.B: The following commands must first be enabled using `enable_experimental`. - -* `yosys_import : String -> TopLevel Term` produces a `Term` given the path to a JSON file produced by the Yosys `write_json` command. - The resulting term is a Cryptol record, where each field corresponds to one HDL module exported by Yosys. - Each HDL module is in turn represented by a function from a record of input port values to a record of output port values. - For example, consider a Yosys JSON file derived from the following VHDL entities: - ~~~~vhdl - entity half is - port ( - a : in std_logic; - b : in std_logic; - c : out std_logic; - s : out std_logic - ); - end half; - - entity full is - port ( - a : in std_logic; - b : in std_logic; - cin : in std_logic; - cout : out std_logic; - s : out std_logic - ); - end full; - ~~~~ - The resulting `Term` will have the type: - ~~~~ - { half : {a : [1], b : [1]} -> {c : [1], s : [1]} - , full : {a : [1], b : [1], cin : [1]} -> {cout : [1], s : [1]} - } - ~~~~ -* `yosys_verify : Term -> [Term] -> Term -> [YosysTheorem] -> ProofScript () -> TopLevel YosysTheorem` proves equality between an HDL module and a specification. - The first parameter is the HDL module - given a record `m` from `yosys_import`, this will typically look something like `{{ m.foo }}`. - The second parameter is a list of preconditions for the equality. - The third parameter is the specification, a term of the same type as the HDL module, which will typically be some Cryptol function or another HDL module. - The fourth parameter is a list of "overrides", which witness the results of previous `yosys_verify` proofs. - These overrides can be used to simplify terms by replacing use sites of submodules with their specifications. - - Note that `Term`s derived from HDL modules are "first class", and are not restricted to `yosys_verify`: they may also be used with SAW's typical `Term` infrastructure like `sat`, `prove_print`, term rewriting, etc. - `yosys_verify` simply provides a convenient and familiar interface, similar to `llvm_verify` or `jvm_verify`. - -# Bisimulation Prover - -SAW contains a bisimulation prover to prove that two terms simulate each other. -This prover allows users to prove that two terms executing in lockstep satisfy -some relations over the state of each circuit and their outputs. This type of -proof is useful in demonstrating the eventual equivalence of two circuits, or -of a circuit and a functional specification. SAW enables these proofs with the -experimental `prove_bisim` command: - -~~~~ -prove_bisim : ProofScript () -> [BisimTheorem] -> Term -> Term -> Term -> Term -> TopLevel BisimTheorem -~~~~ - -When invoking `prove_bisim strat theorems srel orel lhs rhs`, the arguments -represent the following: - -1. `strat`: A proof strategy to use during verification. -2. `theorems`: A list of already proven bisimulation theorems. -3. `srel`: A state relation between `lhs` and `rhs`. This relation must have - the type `lhsState -> rhsState -> Bit`. The relation's first argument is - `lhs`'s state prior to execution. The relation's second argument is `rhs`'s - state prior to execution. `srel` then returns a `Bit` indicating whether - the two arguments satisfy the bisimulation's state relation. -4. `orel`: An output relation between `lhs` and `rhs`. This relation must have - the type `(lhsState, output) -> (rhsState, output) -> Bit`. The relation's - first argument is a pair consisting of `lhs`'s state and output following - execution. The relation's second argument is a pair consisting of `rhs`'s - state and output following execution. `orel` then returns a `Bit` indicating - whether the two arguments satisfy the bisimulation's output relation. -5. `lhs`: A term that simulates `rhs`. `lhs` must have the type - `(lhsState, input) -> (lhsState, output)`. The first argument to `lhs` is a - tuple containing the internal state of `lhs`, as well as the input to `lhs`. - `lhs` returns a tuple containing its internal state after execution, as well - as its output. -6. `rhs`: A term that simulates `lhs`. `rhs` must have the type - `(rhsState, input) -> (rhsState, output)`. The first argument to `rhs` is a - tuple containing the internal state of `rhs`, as well as the input to `rhs`. - `rhs` returns a tuple containing its internal state after execution, as well - as its output. - -On success, `prove_bisim` returns a `BisimTheorem` that can be used in future -bisimulation proofs to enable compositional bisimulation proofs. On failure, -`prove_bisim` will abort. - -## Bisimulation Example - -This section walks through an example proving that the Cryptol implementation -of an AND gate that makes use of internal state and takes two cycles to -complete is equivalent to a pure function that computes the logical AND of its -inputs in one cycle. First, we define the implementation's state type: - -~~~~ -type andState = { loaded : Bit, origX : Bit, origY : Bit } -~~~~ - -`andState` is a record type with three fields: - -1. `loaded`: A `Bit` indicating whether the input to the AND gate has been - loaded into the state record. -2. `origX`: A `Bit` storing the first input to the AND gate. -3. `origY`: A `Bit` storing the second input to the AND gate. - -Now, we define the AND gate's implementation: - -~~~~ -andImp : (andState, (Bit, Bit)) -> (andState, (Bit, Bit)) -andImp (s, (x, y)) = - if s.loaded /\ x == s.origX /\ y == s.origY - then (s, (True, s.origX && s.origY)) - else ({ loaded = True, origX = x, origY = y }, (False, 0)) -~~~~ - -`andImp` takes a tuple as input where the first field is an `andState` holding -the gate's internal state, and second field is a tuple containing the inputs to -the AND gate. `andImp` returns a tuple consisting of the updated `andState` and -the gate's output. The output is a tuple where the first field is a ready bit -that is `1` when the second field is ready to be read, and the second field -is the result of gate's computation. - -`andImp` takes two cycles to complete: - -1. The first cycle loads the inputs into its state's `origX` and `origY` fields - and sets `loaded` to `True`. It sets both of its output bits to `0`. -2. The second cycle uses the stored input values to compute the logical AND. - It sets its ready bit to `1` and its second output to the logical AND - result. - -So long as the inputs remain fixed after the second cycle, `andImp`'s output -remains unchanged. If the inputs change, then `andImp` restarts the -computation (even if the inputs change between the first and second cycles). - -Next, we define the pure function we'd like to prove `andImp` bisimilar to: - -~~~~ -andSpec : ((), (Bit, Bit)) -> ((), (Bit, Bit)) -andSpec (_, (x, y)) = ((), (True, x && y)) -~~~~ - -`andSpec` takes a tuple as input where the first field is `()`, indicating that -`andSpec` is a pure function without internal state, and the second field is a -tuple containing the inputs to the AND function. `andSpec` returns a tuple -consisting of `()` (again, because `andSpec` is stateless) and the function's -output. Like `andImp`, the output is a tuple where the first field is a ready -bit that is `1` when the second field is ready to be read, and the second field -is the result of the function's computation. - -`andSpec` completes in a single cycle, and as such its ready bit is always `1`. -It computes the logical AND directly on the function's inputs using Cryptol's -`(&&)` operator. - -Next, we define a state relation over `andImp` and `andSpec`: - -~~~~ -andStateRel : andState -> () -> Bit -andStateRel _ () = True -~~~~ - -`andStateRel` takes two arguments: - -1. An `andState` for `andImp`. -2. An empty state (`()`) for `andSpec`. - -`andStateRel` returns a `Bit` indicating whether the relation is satisified. In -this case, `andStateRel` always returns `True` because `andSpec` is stateless -and therefore the state relation permits `andImp` to accept any state. - -Lastly, we define a relation over `andImp` and `andSpec`: - -~~~~ -andOutputRel : (andState, (Bit, Bit)) -> ((), (Bit, Bit)) -> Bit -andOutputRel (s, (impReady, impO)) ((), (_, specO)) = - if impReady then impO == specO else True -~~~~ - -`andOutputRel` takes two arguments: - -1. A return value from `andImp`. Specifically, a pair consisting of an - `andState` and a pair containing a ready bit and result of the logical AND. -2. A return value from `andSpec`. Specifically, a pair consisting of an empty - state `()` and a pair containing a ready bit and result of the logical AND. - -`andOutputRel` returns a `Bit` indicating whether the relation is satisfied. It -considers the relation satisfied in two ways: - -1. If `andImp`'s ready bit is set, the relation is satisfied if the output - values `impO` and `specO` from `andImp` and `andSpec` respectively are - equivalent. -2. If `andImp`'s ready bit is not set, the relation is satisfied. - -Put another way, the relation is satisfied if the end result of `andImp` and -`andSpec` are equivalent. The relation permits intermediate outputs to differ. - -We can verify that this relation is always satisfied--and therefore the two -terms are bisimilar--by using `prove_bisim`: - -~~~~ -import "And.cry"; -enable_experimental; - -and_bisim <- prove_bisim z3 [] {{ andStateRel }} {{ andOutputRel }} {{ andImp }} {{ andSpec }}; -~~~~ - -Upon running this script, SAW prints: - -~~~~ -Successfully proved bisimulation between andImp and andSpec -~~~~ - -### Building a NAND gate - -We can make the example more interesting by reusing components to build a NAND -gate. We first define a state type for the NAND gate implementation that -contains `andImp`'s state. This NAND gate will not need any additional state, -so we will define a type `nandState` that is equal to `andState`: - -~~~~ -type nandState = andState -~~~~ - -Now, we define an implementation `nandImp` that calls `andImp` and negates the -result: - -~~~~ -nandImp : (nandState, (Bit, Bit)) -> (nandState, (Bit, Bit)) -nandImp x = (s, (andReady, ~andRes)) - where - (s, (andReady, andRes)) = andImp x -~~~~ - -Note that `nandImp` is careful to preserve the ready status of `andImp`. -Because `nandImp` relies on `andImp`, it also takes two cycles to compute the -logical NAND of its inputs. - -Next, we define a specification `nandSpec` in terms of `andSpec`: - -~~~~ -nandSpec : ((), (Bit, Bit)) -> ((), (Bit, Bit)) -nandSpec (_, (x, y)) = ((), (True, ~ (andSpec ((), (x, y))).1.1)) -~~~~ - -As with `andSpec`, `nandSpec` is pure and computes its result in a single -cycle. - -Next, we define a state relation over `nandImp` and `nandSpec`: - -~~~~ -nandStateRel : andState -> () -> Bit -nandStateRel _ () = True -~~~~ - -As with `andStateRel`, this state relation is always `True` because `nandSpec` -is stateless. - -Lastly, we define an output relation indicating that `nandImp` and `nandSpec` -produce equivalent results once `nandImp`'s ready bit is `1`: - -~~~~ -nandOutputRel : (nandState, (Bit, Bit)) -> ((), (Bit, Bit)) -> Bit -nandOutputRel (s, (impReady, impO)) ((), (_, specO)) = - if impReady then impO == specO else True -~~~~ - -To prove that `nandImp` and `nandSpec` are bisimilar, we again use -`prove_bisim`. This time however, we can reuse the bisimulation proof for the -AND gate by including it in the `theorems` paramter for `prove_bisim`: - -~~~~ -prove_bisim z3 [and_bisim] {{ nandStateRel }} {{ nandOutputRel }} {{ nandImp }} {{ nandSpec }}; -~~~~ - -Upon running this script, SAW prints: - -~~~~ -Successfully proved bisimulation between nandImp and nandSpec -~~~~ - -## Understanding the proof goals - -While not necessary for simple proofs, more advanced proofs may require -inspecting proof goals. `prove_bisim` generates and attempts to solve the -following proof goals: - -~~~~ -OUTPUT RELATION THEOREM: - forall s1 s2 in. - srel s1 s2 -> orel (lhs (s1, in)) (rhs (s2, in)) - -STATE RELATION THEOREM: - forall s1 s2 out1 out2. - orel (s1, out1) (s2, out2) -> srel s1 s2 -~~~~ - -where the variables in the `forall`s are: - -* `s1`: Initial state for `lhs` -* `s2`: Initial state for `rhs` -* `in`: Input value to `lhs` and `rhs` -* `out1`: Initial output value for `lhs` -* `out2`: Initial output value for `rhs` - -The `STATE RELATION THEOREM` verifies that the output relation properly captures -the guarantees of the state relation. The `OUTPUT RELATION THEOREM` verifies -that if `lhs` and `rhs` are executed with related states, then the result of -that execution is also related. These two theorems together guarantee that the -terms simulate each other. - -When using composition, `prove_bisim` also generates and attempts to solve -the proof goal below for any successfully applied `BisimTheorem` in the -`theorems` list: - -~~~~ -COMPOSITION SIDE CONDITION: - forall g_lhs_s g_rhs_s. - g_srel g_lhs_s g_rhs_s -> f_srel f_lhs_s f_rhs_s - where - f_lhs_s = extract_inner_state g_lhs g_lhs_s f_lhs - f_rhs_s = extract_inner_state g_rhs g_rhs_s f_rhs -~~~~ - -where `g_lhs` is an outer term containing a call to an inner term `f_lhs` -represented by a `BisimTheorem` and `g_rhs` is an outer term containing a call -to an inner term `f_rhs` represented by the same `BisimTheorem`. The variables -in `COMPOSITION SIDE CONDITION` are: - -* `extract_inner_state x x_s y`: A helper function that takes an outer term `x`, an - outer state `x_s`, and an inner term `y`, and returns the inner state of `x_s` - that `x` passes to `y`. -* `g_lhs_s`: The state for `g_lhs` -* `g_rhs_s`: The state for `g_rhs` -* `g_srel`: The state relation for `g_lhs` and `g_rhs` -* `f_srel`: The state relation for `f_lhs` and `f_rhs` -* `f_lhs_s`: The state for `f_lhs`, as represented in `g_lhs_s` (extracted using - `extract_inner_state`). -* `f_rhs_s`: The state for `f_rhs`, as represented in `g_rhs_s` (extracted using - `extract_inner_state`). - -The `COMPOSITION SIDE CONDITION` exists to verify that the terms in the -bisimulation relation properly set up valid states for subterms they contain. - -## Limitations - -For now, the `prove_bisim` command has a couple limitations: - -* `lhs` and `rhs` must be named functions. This is because `prove_bisim` uses - these names to perform substitution when making use of compositionality. -* Each subterm present in the list of bisimulation theorems already - proven may be invoked at most once in `lhs` or `rhs`. That is, if some - function `g_lhs` calls `f_lhs`, and `prove_bisim` is invoked with a - `BisimTheorem` proving that `f_lhs` is bisimilar to `f_rhs`, then `g_lhs` may - call `f_lhs` at most once. - -# Formal Deprecation Process - -SAW primitives, and thus their associated SAWScript built-ins, sometimes -become obsolete or are found inadequate and replaced. -The process by which that happens has three steps, as follows: - -1. The decision is made to deprecate and eventually remove the objects -in question. -This can happen at the level of individual built-in elements (for example, -when replacing a function with an awkward interface or unfortunate name) or -at the level of internal units of functionality with possibly multiple -built-ins affected. -At this step the built-ins in question are marked for a deprecation warning. -They remain available by default, but referring to them will trigger a -warning. - -2. The objects in question are made invisible by default. -Now, referring to the affected built-ins will fail unless the -`enable_deprecated` command is used. -In that case referring to them will still produce a warning. - -3. The objects in question are removed entirely and are no longer -available. - -In general any object or group of objects will move only one step per -release; that is, something first marked deprecated (so it warns) in -saw-script 1.2 will not disappear by default before saw-script 1.3 and -not be removed entirely before saw-script 1.4. -The time frame may be longer depending on the needs of downstream -users, the complexity of migration, and the cost/impact of keeping the -deprecated code in the system. - -We may move faster if circumstances dictate, but hope not to need to. - -Objects that have never appeared in a release, or that have never -moved past experimental may be removed without first being deprecated. -However, we aim to avoid this in cases where the objects in question -have gotten substantial use despite their formal status. diff --git a/doc/manual/manual.pdf b/doc/manual/manual.pdf deleted file mode 100644 index 671ec30487..0000000000 Binary files a/doc/manual/manual.pdf and /dev/null differ diff --git a/doc/pdfs/llvm-java-verification-with-saw.pdf b/doc/pdfs/llvm-java-verification-with-saw.pdf new file mode 100644 index 0000000000..287e2284f1 Binary files /dev/null and b/doc/pdfs/llvm-java-verification-with-saw.pdf differ diff --git a/doc/pdfs/rust-verification-with-saw.pdf b/doc/pdfs/rust-verification-with-saw.pdf new file mode 100644 index 0000000000..19ee8f9fa7 Binary files /dev/null and b/doc/pdfs/rust-verification-with-saw.pdf differ diff --git a/doc/pdfs/saw-user-manual.pdf b/doc/pdfs/saw-user-manual.pdf new file mode 100644 index 0000000000..0b9c718463 Binary files /dev/null and b/doc/pdfs/saw-user-manual.pdf differ diff --git a/doc/rust-tutorial/Makefile b/doc/rust-tutorial/Makefile deleted file mode 100644 index 32ec78184d..0000000000 --- a/doc/rust-tutorial/Makefile +++ /dev/null @@ -1,66 +0,0 @@ -TARGET = tmp/sawScriptRustTutorial -SRCS = ${TARGET}.md ${wildcard *.bib} ${wildcard *.cls} ${wildcard *.sty} -CODE = ${wildcard code/*.c} \ - ${wildcard code/*.cry} \ - ${wildcard code/*.java} \ - ${wildcard code/*.saw} -TARBALL = tmp/saw-rust-tutorial-code.tar.gz -SPELLSRC = ${TARGET}.tex -NEWSPELL = ${TARGET}.SPELLNEW -OLDSPELL = ${TARGET}.SPELLOLD -SPELL = aspell -t -l -AUX = ${wildcard *.blg} ${wildcard *.bbl} ${wildcard *.aux} \ - ${wildcard *.eps} ${wildcard *.log} ${wildcard *.toc} \ - ${TARGET}.md -PDFARGS = -H rust-tutorial-head.tex \ - -B rust-tutorial-before.tex \ - -A rust-tutorial-after.tex \ - --toc \ - --listings \ - -V documentclass:galois-whitepaper \ - -V fontsize:12 \ - --pdf-engine=xelatex -HTMLARGS = --css doc.css \ - -B rust-tutorial-before.html \ - --toc \ - --standalone \ - --metadata title="SAWScript Rust Tutorial" \ - --self-contained - -all: ${TARGET}.pdf ${TARGET}.html ${TARBALL} - -${TARGET}.pdf: ${SRCS} Makefile | tmp - pandoc ${PDFARGS} -o $@ ${TARGET}.md - -${TARGET}.html: ${SRCS} Makefile | tmp - pandoc ${HTMLARGS} -o $@ ${TARGET}.md - -${TARBALL}: ${CODE} - tar czf ${TARBALL} code - -# Pre-processing step. Right now, does nothing. -${TARGET}.md: rust-tutorial.md docode.hs ${CODE} | tmp - runhaskell docode.hs < $< > $@ - -docode: ${TARGET}.md | tmp - -.PHONY: spellClean superClean clean quickSpell - -tmp: - mkdir -p tmp - -clean: - -rm -f ${AUX} - -superClean: clean - -rm -f ${TARGET}.pdf ${TARGET}.html ${TARGET}.md tmp - -spellClean: - rm -f ${NEWSPELL} ${OLDSPELL} - -quickSpell: - @touch ${NEWSPELL} - @mv -f ${NEWSPELL} ${OLDSPELL} - @cat ${SPELLSRC} | ${SPELL} | tr "A-Z" "a-z" | sort | uniq | less > ${NEWSPELL} - @echo '(The ones marked with < are new.)' - @diff ${NEWSPELL} ${OLDSPELL} diff --git a/doc/rust-tutorial/README.md b/doc/rust-tutorial/README.md deleted file mode 100644 index c3c5147261..0000000000 --- a/doc/rust-tutorial/README.md +++ /dev/null @@ -1,8 +0,0 @@ -To update the tutorial, edit `rust-tutorial.md` and then run `make`. The output -is generated in `./tmp`. If you want to version a new copy of the tutorial, -copy `./tmp/sawScriptRustTutorial.pdf` to `./sawScriptRustTutorial.pdf` and -commit. - -Note that some uses of the `^` character in this tutorial cause problems with -especially old versions of `pandoc`. I have confirmed that the tutorial builds -successfully with `pandoc 2.17.1.1` or later. diff --git a/doc/rust-tutorial/doc.css b/doc/rust-tutorial/doc.css deleted file mode 100644 index 08327aa2df..0000000000 --- a/doc/rust-tutorial/doc.css +++ /dev/null @@ -1,93 +0,0 @@ -body { - font-size: 14px; - line-height: 21px; - color: #333; - font-family: "Helvetica Neue", "Lucida Grande", "Lucida Sans Unicode", Helvetica, Arial, sans-serif !important; -} -.container { - width: 950px; - margin: 0; - padding: 80px 0px 50px 50px; -} -p, li { - width: 625px; -} -a { - color: #191933; -} -h1, h2, h3, h4, h5, h6, b.header { - color: #000; - margin-top: 40px; - margin-bottom: 15px; - text-shadow: #fff 0 1px 1px; -} -h1 { font-size: 18px } -h2 { font-size: 16px } -h3, h4, h5, h6 { font-size: 14px } -.author { font-weight: normal } - -br.clear { - height: 0; - clear: both; -} -ul { - padding-left: 20px; -} -b.header { - display: block; -} -li { - margin-bottom: 10px; -} -code, pre, tt, textarea { - font-family: Monaco, Consolas, "Lucida Console", monospace; - font-size: 12px; - line-height: 18px; - color: #155; - white-space: pre-wrap; - word-wrap: break-word; -} - tt { - display: inline-block; - background: #fff; - border: 1px solid #dedede; - padding: 0px 0.2em; - } - pre { - border-left: 5px solid rgba(0,0,0,0.2); - padding: 3px 0 3px 12px; - font-size: 12px; - } - pre.no_bar { - border-left: 0; - margin-left: 0; - padding-left: 0; - } -div.code { - position: relative; - background: #fff; - border: 1px solid #d8d8d8; - -webkit-box-shadow: 0px 0px 4px rgba(0,0,0,0.23); - -moz-box-shadow: 0px 0px 4px rgba(0,0,0,0.23); - box-shadow: 0px 0px 4px rgba(0,0,0,0.23); - zoom: 1; -} - div.code { - text-transform: none; - position: absolute; - right: 8px; bottom: 8px; - } - div.code { - left: 8px; right: auto; - } - div.code pre, div.code textarea { - float: left; - width: 450px; - background: #fff; - border-left: 1px dotted #d0d0d0; - margin: 10px 0 15px 3px; - padding: 0 0 0 12px; - } - div.code pre:first-child { - border-left: 0; - } diff --git a/doc/rust-tutorial/docode.hs b/doc/rust-tutorial/docode.hs deleted file mode 100644 index 550c9fec15..0000000000 --- a/doc/rust-tutorial/docode.hs +++ /dev/null @@ -1,30 +0,0 @@ -module Main where - -import Control.Monad -import System.Process - -interactIO :: (String -> IO String) -> IO () -interactIO act = do - txt <- getContents - mapM_ (act >=> putStr) (lines txt) - --- TODO: this does no error handling whatsoever -processCommand :: String -> IO String -processCommand line = - case words line of - ("$cmd":cmd:args) -> do - result <- readProcess cmd args "" - return $ unlines [ unwords ("#" : cmd : args) ] ++ result - ["$include", "all", file] -> readFile file - ["$include", range, file] -> - do txt <- readFile file - return . unlines . take (endN - startN) $ drop startN (lines txt) - where - (start, end') = break (== '-') range - end = tail end' - startN = read start - 1 - endN = read end - _ -> return (line ++ "\n") - -main :: IO () -main = interactIO processCommand diff --git a/doc/rust-tutorial/figures/galois.pdf b/doc/rust-tutorial/figures/galois.pdf deleted file mode 100644 index 58cbef37a0..0000000000 Binary files a/doc/rust-tutorial/figures/galois.pdf and /dev/null differ diff --git a/doc/rust-tutorial/figures/galois.png b/doc/rust-tutorial/figures/galois.png deleted file mode 100644 index 39a5ba86cb..0000000000 Binary files a/doc/rust-tutorial/figures/galois.png and /dev/null differ diff --git a/doc/rust-tutorial/galois-whitepaper.cls b/doc/rust-tutorial/galois-whitepaper.cls deleted file mode 100644 index 9e47c94810..0000000000 --- a/doc/rust-tutorial/galois-whitepaper.cls +++ /dev/null @@ -1,143 +0,0 @@ -% This class file defines a style useful for writing whitepapers at Galois. -\ProvidesClass{galois-whitepaper}[2010/11/02 Galois whitepaper class] -\newif\if@xelatex -\@xelatexfalse -\DeclareOption{xelatex}{\@xelatextrue} -\DeclareOption*{% -\PassOptionsToClass{\CurrentOption}{article}% -} -\ProcessOptions\relax -\LoadClass{article} - -% Useful packages: -% \RequirePackage[table]{xcolor} -\RequirePackage{graphicx} -%\RequirePackage[headheight=144pt,top=1in,left=1in,right=1in,bottom=1in]{geometry} -\RequirePackage[top=1in,left=1in,right=1in,bottom=1in]{geometry} -\RequirePackage{lastpage} - -% Define fonts -\if@xelatex -\RequirePackage{fontspec} -\RequirePackage{xunicode} -\RequirePackage{xltxtra} -\setmainfont[Mapping=tex-text]{Palatino} -\setsansfont[Mapping=tex-text]{Arial} -\else -\RequirePackage[sc]{mathpazo} -\linespread{1.05} -\RequirePackage[scaled]{helvet} -\fi - -\RequirePackage{sectsty} -\allsectionsfont{\sffamily} - -\newcommand{\highlight}[2][yellow]{\colorbox{#1}{#2}} - -% Misc defines -\makeatletter - \newcommand{\@companyName}{Galois, Inc.} - \newcommand{\@addressPhone}{ - 421 SW 6th Ave., Ste.\ 300\\ - Portland, OR 97204 - } - \newcommand{\@contact}{\todo[Contact]} - \newcommand{\@subtitle}{TODO: Subtitle} - \newcommand{\contact}[1]{\renewcommand{\@contact}{{#1}}} -\makeatother - -\renewcommand{\paragraph}[1]{\textbf{\sffamily{#1}}\quad} - -% \definecolor{galoislightgray}{gray}{0.6} -% Define header and footer -\RequirePackage{fancyhdr} -\pagestyle{fancy} -\fancyheadoffset[l]{1in} -\fancyheadoffset[r]{0in} -%\fancyheadoffset[r]{0.2in} -\fancyhead{}%clear -\fancyfoot{}%clear -\renewcommand{\headrulewidth}{0pt} -\renewcommand{\footrulewidth}{1pt} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% -%% Define a special header style for the title page: -% - \makeatletter - \newcommand{\ps@title}{% - \renewcommand{\@oddhead}{% - \sffamily\includegraphics[width=1in]{figures/galois.pdf} - }} -\makeatother - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% -%% Set up the headers/footers for the rest of the document: -%% -\lhead{\sffamily\hspace{1in}\includegraphics[width=1in]{figures/galois.pdf}} - -\makeatletter -\rhead{\sffamily \textit{\@title{}}} -\makeatother - -\lfoot{\sffamily\color{lightgray}\small\copyright{} 2023 Galois, Inc.} - -\rfoot{ - % \sffamily \color{galoislightgray}\small - \sffamily \color{lightgray}\small - Page \thepage{}% of \pageref{LastPage} -} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% -%% The title content definition: -%% -\makeatletter -\DeclareRobustCommand{\maketitle}{ - \thispagestyle{title} - \begin{center} - {\sffamily - {\large \textbf{}}\\ - \vspace{0.3in} - \textbf{\sffamily \LARGE \@title}\\ - %\vspace{0.5in} - %{ \@author }\\ - \vspace{0.3in} - {\large \@companyName }\\ - \vspace{0.025in} - {\small \@addressPhone }\\ - } - \end{center} -} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% -%% Add a stop ``.'' after each section number: -%% - -% Redefine the bibliography environment to use \section{\bibname} -% instead of \section*{\bibname} -\newcommand{\bibname}{References} -\renewenvironment{thebibliography}[1] - {\section*{\bibname}% <-- this line was changed from \chapter* to \section* - \@mkboth{\MakeUppercase\bibname}{\MakeUppercase\bibname}% - \list{\@biblabel{\@arabic\c@enumiv}}% - {\settowidth\labelwidth{\@biblabel{#1}}% - \leftmargin\labelwidth - \advance\leftmargin\labelsep - \@openbib@code - \usecounter{enumiv}% - \let\p@enumiv\@empty - \renewcommand\theenumiv{\@arabic\c@enumiv}}% - \sloppy - \clubpenalty4000 - \@clubpenalty \clubpenalty - \widowpenalty4000% - \sfcode`\.\@m} - {\def\@noitemerr - {\@latex@warning{Empty `thebibliography' environment}}% - \endlist} - -\makeatother diff --git a/doc/rust-tutorial/rust-tutorial-after.tex b/doc/rust-tutorial/rust-tutorial-after.tex deleted file mode 100644 index aae03f4149..0000000000 --- a/doc/rust-tutorial/rust-tutorial-after.tex +++ /dev/null @@ -1,4 +0,0 @@ -\newpage -\bibliography{sawScriptRustTutorial} -\addcontentsline{toc}{section}{References} -\bibliographystyle{abbrv} diff --git a/doc/rust-tutorial/rust-tutorial-before.html b/doc/rust-tutorial/rust-tutorial-before.html deleted file mode 100644 index 2ce0a49c74..0000000000 --- a/doc/rust-tutorial/rust-tutorial-before.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/doc/rust-tutorial/rust-tutorial-before.tex b/doc/rust-tutorial/rust-tutorial-before.tex deleted file mode 100644 index 2a8b21e849..0000000000 --- a/doc/rust-tutorial/rust-tutorial-before.tex +++ /dev/null @@ -1 +0,0 @@ -\maketitle diff --git a/doc/rust-tutorial/rust-tutorial-head.tex b/doc/rust-tutorial/rust-tutorial-head.tex deleted file mode 100644 index a38a1e6da2..0000000000 --- a/doc/rust-tutorial/rust-tutorial-head.tex +++ /dev/null @@ -1,60 +0,0 @@ -\usepackage{listings} -\usepackage{float} -\usepackage{xspace} -\usepackage{color} -\usepackage{tikz} -\usepackage{url} -\usepackage{amsmath} -\usepackage{amscd} -\usepackage{verbatim} -\usepackage{fancyvrb} - -\let\verbatiminput=\verbatimtabinput -\VerbatimFootnotes -\DefineVerbatimEnvironment{code}{Verbatim}{} -\DefineVerbatimEnvironment{pseudoCode}{Verbatim}{} -\hyphenation{SAW-Script} -\newcommand{\sawScript}{{\sc SAWScript}\xspace} -\renewcommand{\textfraction}{0.05} -\renewcommand{\topfraction}{0.95} -\renewcommand{\bottomfraction}{0.95} -\renewcommand{\floatpagefraction}{0.35} -\setcounter{totalnumber}{5} -\definecolor{MyGray}{rgb}{0.9,0.9,0.9} -\makeatletter\newenvironment{graybox}{% - \begin{lrbox}{\@tempboxa}\begin{minipage}{\columnwidth}}{\end{minipage}\end{lrbox}% - \colorbox{MyGray}{\usebox{\@tempboxa}} -}\makeatother - -\setlength{\parskip}{0.6em} -\setlength{\abovecaptionskip}{0.5em} - -\lstset{ - basicstyle=\footnotesize\ttfamily, % Standardschrift - %numbers=left, % Ort der Zeilennummern - numberstyle=\tiny, % Stil der Zeilennummern - %stepnumber=2, % Abstand zwischen den Zeilennummern - numbersep=5pt, % Abstand der Nummern zum Text - tabsize=2, % Groesse von Tabs - extendedchars=true, % - breaklines=true, % Zeilen werden Umgebrochen - keywordstyle=\color{red}, - frame=lrtb, % left, right, top, bottom frames. - % keywordstyle=[1]\textbf, % Stil der Keywords - % keywordstyle=[2]\textbf, % - % keywordstyle=[3]\textbf, % - % keywordstyle=[4]\textbf, \sqrt{\sqrt{}} % - stringstyle=\color{white}\ttfamily, % Farbe der String - showspaces=false, % Leerzeichen anzeigen ? - showtabs=false, % Tabs anzeigen ? - xleftmargin=10pt, % was 17 - xrightmargin=5pt, - framexleftmargin=5pt, % was 17 - framexrightmargin=-1pt, % was 5pt - framexbottommargin=4pt, - %backgroundcolor=\color{lightgray}, - showstringspaces=false % Leerzeichen in Strings anzeigen ? -} - -\author{The Galois SAW Team\\\texttt{saw@galois.com}} -\title{SAWScript Rust Tutorial} diff --git a/doc/rust-tutorial/rust-tutorial.md b/doc/rust-tutorial/rust-tutorial.md deleted file mode 100644 index 865e4ff602..0000000000 --- a/doc/rust-tutorial/rust-tutorial.md +++ /dev/null @@ -1,3171 +0,0 @@ -# A tutorial on Rust verification with SAW - -SAW is a special-purpose programming environment developed by Galois to help -orchestrate and track the results of the large collection of proof tools -necessary for analysis and verification of complex software artifacts. - -SAW adopts the functional paradigm, and largely follows the structure of many -other typed functional languages, with some special features specifically -targeted at the coordination of verification and analysis tasks. - -This tutorial introduces the details of the language by walking through several -examples, and showing how simple verification tasks can be described. The -complete examples are available [on -GitHub](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code). -Most of the examples make use of inline specifications written in Cryptol, a -language originally designed for high-level descriptions of cryptographic -algorithms. For readers unfamiliar with Cryptol, various documents describing -its use are available [here](http://cryptol.net/documentation.html). - -This tutorial also include a [case study](#case-study-salsa20) on how to use -SAW to verify a real-world implementation of the Salsa20 stream cipher based on -the [`stream-ciphers`](https://github.com/RustCrypto/stream-ciphers) Rust -project. The code for this case study is also available [on -GitHub](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20). - -## Prerequisites - -In order to run the examples in this tutorial, you will need to install the -following prerequisite tools: - -* SAW itself, which can be installed by following the instructions - [here](https://github.com/GaloisInc/saw-script#manual-installation). -* The Z3 and Yices SMT solvers. Z3 can be downloaded from - [here](https://github.com/Z3Prover/z3/releases), and Yices can be downloaded - from [here](https://yices.csl.sri.com/). -* The `mir-json` tool, which can be installed by following the instructions - [here](https://github.com/GaloisInc/mir-json#installation-instructions). - -## About `mir-json` - -We are interested in verifying code written in Rust, but Rust is an extremely -rich programming language that has many distinct language features. To make the -process of verifying Rust simpler, SAW targets an intermediate language used in -the Rust compiler called [MIR](https://blog.rust-lang.org/2016/04/19/MIR.html) -(short for "Mid-level IR"). MIR takes the variety of different features and -syntactic extensions in Rust and boils them down to a minimal subset that is -easier for a computer program to analyze. - -The process of extracting MIR code that is suitable for SAW's needs is somewhat -tricky, so we wrote a suite of tools called `mir-json` to automate this -process. `mir-json` provides a plugin for tools like `rustc` and `cargo` that -lets you compile Rust code as you normally would and produce an additional -`.json` file as output. This `.json` file contains the MIR code that was -produced internally by the Rust compiler, with some additional minor tweaks to -make it suitable for SAW's needs. - -`mir-json` is not a single tool but rather a suite of related tools that -leverage the same underlying plugin. For SAW purposes, the two `mir-json` tools -that are most relevant are: - -* `saw-rustc`: A thin wrapper around `rustc` (the Rust compiler), which is - suitable for individual `.rs` files. -* `cargo-saw-build`: A thin wrapper around the `cargo build` command, which is - suitable for `cargo`-based Rust projects. - -Most of the examples in this tutorial involve self-contained examples, which -will use `saw-rustc`. Later in the tutorial, we will examine a Salsa20 case -study that involves a `cargo`-based project, which will use `cargo-saw-build`. - -### A first example with `saw-rustc` - -Let's try out `saw-rustc` on a small example file, which we'll name -`first-example.rs`: - -``` {.rs} -$include all code/first-example.rs -``` - -This is the identity function, but specialized to the type `u8`. We can compile -this with `saw-rustc` like so: - -``` -$ saw-rustc first-example.rs - -note: Emitting MIR for first_example/abef32c5::id_u8 - -linking 1 mir files into first-example.linked-mir.json - -``` - -`saw-rustc` prints out some additional information that `rustc` alone does not -print, and we have displayed the parts of this information that are most -interesting. In particular: - -* `saw-rustc` notes that is is `Emitting MIR for first_example/abef32c5::id_u8`, - where `first_example/abef32c5::id_u8` is the full _identifier_ used to uniquely - refer to the `id_u8` function. It's entirely possible that the `abef32c5` - bit will look different on your machine; we'll talk more about identifiers in - the "Identifiers" section. -* Once `saw-rustc` produced a MIR JSON file named - `first-example.linked-mir.json`. This is an important bit of information, as - SAW will ingest this JSON file. - -If you'd like, you can inspect the `first-example.linked-mir.json` file with -JSON tools (e.g., [`jq`](https://jqlang.github.io/jq/)), but it is not -important to understand everything that is going on there. This is -machine-generated JSON, and as such, it is not meant to be particularly -readable by human eyes. - -### The `SAW_RUST_LIBRARY_PATH` environment variable - -Rust has a large set of standard libraries that ship with the compiler, and -parts of the standard library are quite low-level and tricky. SAW's primary -objective is to provide a tool that can analyze code in a tractable way. For -this reason, SAW sometimes needs to invoke simplified versions of Rust standard -library functions that are more reasonable for an SMT-based tool like SAW to -handle. These simplified functions are equivalent in behavior, but avoid using -problematic code patterns (e.g., gnarly pointer arithmetic or the -[`transmute`](https://doc.rust-lang.org/std/intrinsics/fn.transmute.html) -function). - -If you are only ever compiling self-contained pieces of code with `saw-rustc`, -there is a good chance that you can get away without needing to use SAW's -custom version of the Rust standard libraries. However, if you ever need to -build something more complicated than that (e.g., the Salsa20 case study later -in this tutorial), then you _will_ need the custom libraries. For this reason, -it is worthwhile to teach SAW the location of the custom libraries now. - -At present, the best way to obtain the custom version of the Rust standard -libraries is to perform the following steps: - -1. Clone the [`crucible`](https://github.com/GaloisInc/crucible) repo like so: - - ``` - $ git clone https://github.com/GaloisInc/crucible - ``` - -2. Navigate to the - [`crux-mir`](https://github.com/GaloisInc/crucible/tree/master/crux-mir) - subdirectory of the `crucible` checkout: - - ``` - $ cd crucible/crux-mir/ - ``` - -3. Run the `translate_libs.sh` script: - - ``` - $ ./translate_libs.sh - ``` - - This will compile the custom versions of the Rust standard libraries using - `mir-json`, placing the results under the `rlibs` subdirectory. - -4. Finally, define a `SAW_RUST_LIBRARY_PATH` environment variable that points - to the newly created `rlibs` subdirectory: - - ``` - $ export SAW_RUST_LIBRARY_PATH=<...>/crucible/crux-mir/rlibs - ``` - -An upcoming release of SAW will include these custom libraries pre-built, which -will greatly simplify the steps above. Either way, you will need to set the -`SAW_RUST_LIBRARY_PATH` environment variable to point to the location of the -custom libraries. - -### A note about generics - -The `id_u8` function above is likely not how most Rust programmers would define -the identity function. Instead, it would seem more natural to define it -generically, that is, by parameterizing the function by a type parameter: - -``` {.rs} -$include all code/generics-take-1.rs -``` - -If you compile this with `saw-rustc`, however, the resulting JSON file will -lack a definition for `id`! We can see this by using `jq`: - -``` -$ saw-rustc generics-take-1.rs - -$ jq . generics-take-1.linked-mir.json -{ - "fns": [], - "adts": [], - "statics": [], - "vtables": [], - "traits": [], - "intrinsics": [], - "tys": [], - "roots": [] -} -``` - -What is going on here? This is the result of an important design choice that -SAW makes: _SAW only supports verifying monomorphic functions_. To be more -precise, SAW's approach to symbolic simulation requires all of the code being -simulated to have fixed types without any type parameters. - -In order to verify a function using generics in your Rust code, you must -provide a separate, monomorphic function that calls into the generic function. -For example, you can rewrite the example above like so: - -``` {.rs} -$include all code/generics-take-2.rs -``` - -If you compile this version with `saw-rustc`, you'll see: - -``` -$ saw-rustc generics-take-2.rs - -note: Emitting MIR for generics_take_2/8b1bf337::id_u8 - -note: Emitting MIR for generics_take_2/8b1bf337::id::_instaddce72e1232152c[0] - -linking 1 mir files into generics-take-2.linked-mir.json - -``` - -This time, the resulting JSON file contains a definition for `id_u8`. The -reason that this works is because when `id_u8` calls `id`, the Rust compile -will generate a specialized version of `id` where `A` is instantiated with the -type `u8`. This specialized version of `id` is named -`id::_instaddce72e1232152c[0]` in the output above. (You don't have to remember -this name, thankfully!) - -Although the resulting JSON file contains a definition for `id_u8`, it does -_not_ contain a definition for the generic `id` function. As a result, SAW will -only be able to verify the `id_u8` function from this file. If you are ever in -doubt about which functions are accessible for verification with SAW, you can -check this with `jq` like so: - -``` -$ jq '.intrinsics | map(.name)' generics-take-2.linked-mir.json -[ - "generics_take_2/8b1bf337::id_u8", - "generics_take_2/8b1bf337::id::_instaddce72e1232152c[0]" -] -``` - -Here, "intrinsics" are monomorphic functions that are visible to SAW. Note that -`saw-rustc` will optimize away all definitions that are not accessible from one -of these intrinsic functions. This explains why the original program that only -defined a generic `id` function resulted in a definition-less JSON file, as -that program did not contain monomorphic functions (and therefore no -intrinsics). - -Generally speaking, we prefer to verify functions that are defined directly in -the Rust source code, such as `id_u8`, as these functions' names are more -stable than the specialized versions of functions that the compiler generates, -such as `id::_instaddce72e1232152c[0]`. Do note that SAW is capable of -verifying both types of functions, however. (We will see an example of -verifying an autogenerated function in the Salsa20 case study later in this -tutorial.) - -### Identifiers - -When you compile a function named `id_u8`, `saw-rustc` will expand it to a much -longer name such as `first_example/abef32c5::id_u8`. This longer name is called -an _identifier_, and it provides a globally unique name for that function. In -the small examples we've seen up to this point, there hasn't been any risk of -name collisions, but you could imagine compiling this code alongside another -file (or crate) that also defines an `id_u8` function. If that happens, then it -is essential that we can tell apart all of the different `id_u8` functions, and -identifiers provide us the mechanism for doing so. - -Let's take a closer look at what goes into an identifier. In general, an identifier -will look like the following: - -* `/::` - -`` is the name of the crate in which the function is defined. All -of the examples we've seen up to this point have been defined in standalone -files, and as a result, the crate name has been the same as the file name, but -without the `.rs` extension and with all hyphens replaced with underscores -(e.g., `first-example.rs` is given the crate name `first_example`). In -`cargo`-based projects, the crate name will likely differ from the file name. - -`` is a hash of the crate and its dependencies. In extreme -cases, it is possible for two different crates to have identical crate names, -in which case the disambiguator must be used to distinguish between the two -crates. In the common case, however, most crate names will correspond to -exactly one disambiguator. (More on this in a bit.) - -`` is the path to the function within the crate. Sometimes, this -is as simple as the function name itself. In other cases, a function path may -involve multiple _segments_, depending on the module hierarchy for the program -being verified. For instance, a `read` function located in -`core/src/ptr/mod.rs` will have the identifier: - -``` -core/::ptr::read -``` - -Where `core` is the crate name and `ptr::read` is the function path, which -has two segments `ptr` and `read`. There are also some special forms of -segments that appear for functions defined in certain language constructs. -For instance, if a function is defined in an `impl` block, then it will have -`{impl}` as one of its segments, e.g., - -``` -core/::ptr::const_ptr::{impl}::offset -``` - -The most cumbersome part of writing an identifier is the disambiguator, as it -is extremely sensitive to changes in the code (not to mention hard to remember -and type). Luckily, the vast majority of crate names correspond to exactly one -disambiguator, and we can exploit this fact to abbreviate identifiers that live -in such crates. For instance, we can abbreviate this identifier: - -``` -core/::ptr::read -``` - -To simply: - -``` -core::ptr::read -``` - -We will adopt the latter, shorter notation throughout the rest of the tutorial. -SAW also understands this shorthand, so we will also use this notation when -passing identifiers to SAW commands. - -## SAW basics - -### A first SAW example - -We now have the knowledge necessary to compile Rust code in a way that is -suitable for SAW. Let's put our skills to the test and verify something! We will -build on the example from above, which we will put into a file named -`saw-basics.rs`: - -``` {.rs} -$include all code/saw-basics.rs -``` - -Our goal is to verify the correctness of the `id_u8` function. However, it is -meaningless to talk about whether a function is correct without having a -_specification_ for how the function should behave. This is where SAW enters -the picture. SAW provides a scripting language named _SAWScript_ that allows -you to write a precise specification for describing a function's behavior. For -example, here is a specification that captures the intended behavior of -`id_u8`: - -``` -$include 3-7 code/saw-basics.saw -``` - -At a high level, this specification says that `id_u8` is a function that accepts -a single argument of type `u8`, and it returns its argument unchanged. Nothing -too surprising there, but this example illustrates many of the concepts that one -must use when working with SAW. Let's unpack what this is doing, line by line: - -* In SAWScript, specifications are ordinary values that are defined with `let`. - In this example, we are defining a specification named `id_u8_spec`. -* Specifications are defined using "`do`-notation". That is, they are assembled - by writing `do { ; ; ...; ; }`, where each `` is a - statement that declares some property about the function being verified. A - statement can optionally bind a variable that can be passed to later - statements, which is accomplished by writing ` <- `. -* The `x <- mir_fresh_var "x" mir_u8;` line declares that `x` is a fresh - variable of type `u8` (represented by `mir_u8` in SAWScript) that has some - unspecified value. In SAW parlance, we refer to these unspecified values as - _symbolic_ values. SAW uses an SMT solver under the hood to reason about - symbolic values. - - The `"x"` string indicates what name the variable `x` should have when sent - to the underlying SMT solver. This is primarily meant as a debugging aid, and - it is not required that the string match the name of the SAWScript variable. - (For instance, you could just as well have passed `"x_smt"` or something - else.) -* The `mir_execute_func [mir_term x];` line declares that the function should - be invoked with `x` as the argument. For technical reasons, we pass - `mir_term x` to `mir_execute_func` rather than just `x`; we will go over what - `mir_term` does later in the tutorial. -* Finally, the `mir_return (mir_term x);` line declares that the function should - return `x` once it has finished. - -Now that we have a specification in hand, it's time to prove that `id_u8` -actually adheres to the spec. To do so, we need to load the MIR JSON version of -`id_u8` into SAW, which is done with the `mir_load_module` command: - -``` -$include 9-9 code/saw-basics.saw -``` - -This `m` variable contains the definition of `id_u8`, as well as the other code -defined in the program. We can then pass `m` to the `mir_verify` command, which -actually verifies that `id_u8` behaves according to `id_u8_spec`: - -``` -$include 11-11 code/saw-basics.saw -``` - -Here is what is going on in this command: - -* The `m` and `"saw_basics::id_u8"` arguments instruct SAW to verify the - `id_u8` function located in the `saw_basics` crate defined in `m`. Note that - we are using the shorthand identifier notation here, so we are allowed to omit - the disambiguator for the `saw_basics` crate. -* The `[]` argument indicates that we will not provide any function overrides to - use when SAW simulates the `id_u8` function. (We will go over how overrides - work later in the tutorial.) -* The `false` argument indicates that SAW should not use path satisfiability - checking when analyzing the function. Path satisfiability checking is an - advanced SAW feature that we will not be making use of in this tutorial, so we - will always use `false` here. -* The `id_u8_spec` argument indicates that `id_u8` should be checked against the - specification defined by `id_u8_spec`. -* The `z3` argument indicates that SAW should use the Z3 SMT solver to solve - any proof goals that are generated during verification. SAW also supports - other SMT solvers, although we will mostly use Z3 in this tutorial. - -Putting this all together, our complete `saw-basics.saw` file is: - -``` -$include all code/saw-basics.saw -``` - -One minor detail that we left out until just now is that the SAW's interface to -MIR is still experimental, so you must explicitly opt into it with the -`enable_experimental` command. - -Now that everything is in place, we can check this proof like so: - -``` -$ saw saw-basics.saw - - - -[16:14:07.006] Loading file "saw-basics.saw" -[16:14:07.009] Verifying saw_basics/f77ebf43::id_u8[0] ... -[16:14:07.017] Simulating saw_basics/f77ebf43::id_u8[0] ... -[16:14:07.017] Checking proof obligations saw_basics/f77ebf43::id_u8[0] ... -[16:14:07.017] Proof succeeded! saw_basics/f77ebf43::id_u8[0] -``` - -Tada! SAW was successfully able to prove that `id_u8` adheres to its spec. - -### Cryptol - -The spec in the previous section is nice and simple. It's also not very -interesting, as it's fairly obvious at a glance that `id_u8`'s implementation -is correct. Most of the time, we want to verify more complicated functions -where the correspondence between the specification and the implementation is -not always so clear. - -For example, consider this function, which multiplies a number by two: - -``` {.rs} -$include 1-3 code/times-two.rs -``` - -The straightforward way to implement this function would be to return `2 * x`, -but the author of this function _really_ cared about performance. As such, the -author applied a micro-optimization that computes the multiplication with a -single left-shift (`<<`). This is the sort of scenario where we are pretty sure -that the optimized version of the code is equivalent to the original version, -but it would be nice for SAW to check this. - -Let's write a specification for the `times_two` function: - -``` -$include 3-7 code/times-two.saw -``` - -This spec introduces code delimited by double curly braces `{{ ... }}`, which -is a piece of syntax that we haven't seen before. The code in between the curly -braces is written in [Cryptol](http://cryptol.net/documentation.html), a -language designed for writing high-level specifications of various algorithms. -Cryptol supports most arithmetic operations, so `2 * x` works exactly as you -would expect. Also note that the `x` variable was originally bound in the -SAWScript language, but it is possible to embed `x` into the Cryptol language -by referencing `x` within the curly braces. (We'll say more about how this -embedding works later.) - -`{{ 2 * x }}` takes the Cryptol expression `2 * x` and lifts it to a SAW -expression. As such, this SAW spec declares that the function takes a single -`u32`-typed argument `x` and returns `2 * x`. We could have also wrote the -specification to declare that the function returns `x << 1`, but that would -have defeated the point of this exercise: we specifically want to check that -the function against a spec that is as simple and readable as possible. - -Our full SAW file is: - -``` -$include 1-11 code/times-two.saw -``` - -Which we can verify is correct like so: - -``` -$ saw times-two.saw - - - -[17:51:35.469] Loading file "times-two.saw" -[17:51:35.497] Verifying times_two/6f4e41af::times_two[0] ... -[17:51:35.512] Simulating times_two/6f4e41af::times_two[0] ... -[17:51:35.513] Checking proof obligations times_two/6f4e41af::times_two[0] ... -[17:51:35.527] Proof succeeded! times_two/6f4e41af::times_two[0] -``` - -Nice! Even though the `times_two` function does not literally return `2 * x`, -SAW is able to confirm that the function behaves as if it were implemented that -way. - -### `Term`s and other types - -Now that we know how Cryptol can be used within SAW, we can go back and explain -what the `mir_term` function does. It is helpful to examine the type of -`mir_term` by using SAW's interactive mode. To do so, run the `saw` binary -without any other arguments: - -``` -$ saw -``` - -Then run `enable_experimental` (to enable MIR-related commands) and run `:type -mir_term`: - -``` -sawscript> enable_experimental -sawscript> :type mir_term -Term -> MIRValue -``` - -Here, we see that `mir_term` accepts a `Term` as an argument and returns a -`MIRValue`. In this context, the `Term` type represents a Cryptol value, and -the `MIRValue` type represents SAW-related MIR values. `Term`s can be thought -of as a subset of `MIRValue`s, and the `mir_term` function is used to promote a -`Term` to a `MIRValue`. - -Most other MIR-related commands work over `MIRValue`s, as can be seen with -SAW's `:type` command: - -``` -sawscript> :type mir_execute_func -[MIRValue] -> MIRSetup () -sawscript> :type mir_return -MIRValue -> MIRSetup () -``` - -Note that `MIRSetup` is the type of statements in a MIR specification, and two -`MIRSetup`-typed commands can be chained together by using `do`-notation. -Writing `MIRSetup ()` means that the statement does not return anything -interesting, and the use of `()` here is very much analogous to how `()` is -used in Rust. There are other `MIRSetup`-typed commands that _do_ return -something interesting, as is the case with `mir_fresh_var`: - -``` -sawscript> :type mir_fresh_var -String -> MIRType -> MIRSetup Term -``` - -This command returns a `MIRSetup Term`, which means that when you write `x <- -mir_fresh_var ... ...` in a MIR specification, then `x` will be bound at type -`Term`. - -Values of type `Term` have the property that they can be embedded into Cryptol -expression that are enclosed in double curly braces `{{ ... }}`. This is why -our earlier `{{ 2 * x }}` example works, as `x` is of type `Term`. - -### Preconditions and postconditions - -As a sanity check, let's write a naïve version of `times_two` that explicitly -returns `2 * x`: - -``` {.rs} -$include 5-7 code/times-two.rs -``` - -It seems like we should be able to verify this `times_two_ref` function using -the same spec that we used for `times_two`: - -``` -$include 11-11 code/times-two-ref-fail.saw -``` - -Somewhat surprisingly, SAW fails to verify this function: - -``` -$ saw times-two-ref-fail.saw - - - -[18:58:22.578] Loading file "times-two-ref-fail.saw" -[18:58:22.608] Verifying times_two/56182919::times_two_ref[0] ... -[18:58:22.621] Simulating times_two/56182919::times_two_ref[0] ... -[18:58:22.622] Checking proof obligations times_two/56182919::times_two_ref[0] ... -[18:58:22.640] Subgoal failed: times_two/56182919::times_two_ref[0] attempt to compute `const 2_u32 * move _2`, which would overflow -[18:58:22.640] SolverStats {solverStatsSolvers = fromList ["SBV->Z3"], solverStatsGoalSize = 375} -[18:58:22.640] ----------Counterexample---------- -[18:58:22.640] x: 2147483648 -[18:58:22.640] Stack trace: -"mir_verify" (times-two-ref-fail.saw:11:1-11:11) -Proof failed. -``` - -The "`which would overflow`" portion of the error message suggests what went -wrong. When a Rust program is compiled with debug settings (which is the -default for `rustc` and `saw-rustc`), arithmetic operations such as -multiplication will check if the result of the operation can fit in the -requested number of bits. If not, the program will raise an error. - -In this case, we must make the result of multiplication fit in a `u32`, which -can represent values in the range `0` to `2^^32 - 1` (where `^^` is Cryptol's -exponentiation operator). But it is possible to take a number in this range, -multiply it by two, and have the result fall outside of the range. In fact, SAW -gives us a counterexample with exactly this number: `2147483648`, which can -also be written as `2^^31`. Multiplying this by two yields `2^^32`, which is -just outside of the range of values expressible with `u32`. SAW's duties -include checking that a function cannot fail at runtime, so this function falls -afoul of that check. - -Note that we didn't have this problem with the original definition of -`times_two` because the semantics of `<<` are such that if the result is too -large to fit in the requested type, then the result will _overflow_, i.e., wrap -back around to zero and count up. This means that `(2^^31) << 1` evaluates to -`0` rather than raising an error. Cryptol's multiplication operation also -performs integer overflow (unlike Rust in debug settings), which is why we -didn't notice any overflow-related issues when verifying `times_two`. - -There are two possible ways that we can repair this. One way is to rewrite -`times_two_ref` to use Rust's -[`wrapping_mul`](https://doc.rust-lang.org/std/primitive.u32.html#method.wrapping_mul) -function, a variant of multiplication that always uses integer overflow. This -work around the issue, but it is a bit more verbose. - -The other way is to make our spec more precise such that we only verify -`times_two_ref` for particular inputs. Although `times_two_ref` will run into -overflow-related issues when the argument is `2^^31` or greater, it is -perfectly fine for inputs smaller than `2^^31`. We can encode such an -assumption in SAW by adding a _precondition_. To do so, we write a slightly -modified version of `times_two_spec`: - -``` -$include 13-18 code/times-two.saw -``` - -The most notable change is the `mir_precond {{ x < 2^^31 }};` line. -`mir_precond` (where "`precond`" is short for "precondition") is a command that -takes a `Term` argument that contains a boolean predicate, such as `{{ x < -2^^31 }}`. Declaring a precondition requires that this predicate must hold -during verification, and any values of `x` that do not satisfy this predicate -are not considered. - -By doing this, we have limited the range of the function from `0` to `2^^31 - -1`, which is exactly the range of values for which `times_two_ref` is well -defined. SAW will confirm this if we run it: - -``` -$include 20-20 code/times-two.saw -``` -``` -[19:23:53.480] Verifying times_two/56182919::times_two_ref[0] ... -[19:23:53.496] Simulating times_two/56182919::times_two_ref[0] ... -[19:23:53.497] Checking proof obligations times_two/56182919::times_two_ref[0] ... -[19:23:53.531] Proof succeeded! times_two/56182919::times_two_ref[0] -``` - -We can add as many preconditions to a spec as we see fit. For instance, if we -only want to verify `times_two_ref` for positive integers, we could add an -additional assumption: - -``` -$include 22-28 code/times-two.saw -``` - -In addition to preconditions, SAW also supports postconditions. Whereas -preconditions represent conditions that must hold _before_ invoking a function, -postconditions represent conditions that must hold _after_ invoking a function. -We have already seen one type of postcondition in the form of the `mir_return` -command, which imposes a postcondition on what the return value must be equal -to. - -We can introduce additional postconditions with the `mir_postcond` command. -For example, if we call `times_two_ref` with a positive argument, then it -should be the case that the return value should be strictly greater than the -argument value. We can check for this using `mir_postcond` like so: - -``` -$include 32-39 code/times-two.saw -``` - -An additional convenience that SAW offers is the `mir_assert` command. -`mir_assert` has the same type as `mir_precond` and `mir_postcond`, but -`mir_assert` can be used to declare both preconditions _and_ postconditions. -The difference is where `mir_assert` appears in a specification. If -`mir_assert` is used before the call to `mir_execute_func`, then it declares a -precondition. If `mir_assert` is used after the call to `mir_execute_func`, -then it declares a postcondition. - -For example, we can rewrite `times_two_ref_positive_postcond_spec` to use -`mir_assert`s like so: - -``` -$include 43-50 code/times-two.saw -``` - -The choice of whether to use `mir_precond`/`mir_postcond` versus `mir_assert` is -mostly a matter personal taste. - -## Reference types - -All of the examples we have seen up to this point involve simple integer types -such as `u8` and `u32`. While these are useful, Rust's type system features -much more than just integers. A key part of Rust's type system are its -reference types. For example, in this `read_ref` function: - -``` {.rs} -$include 1-3 code/ref-basics.rs -``` - -The function reads the value that `r` (of type `&u32`) points to and returns -it. Writing SAW specifications involving references is somewhat trickier than -with other types of values because we must also specify what memory the -reference points to. SAW provides a special command for doing this called -`mir_alloc`: - -``` -sawscript> :type mir_alloc -MIRType -> MIRSetup MIRValue -``` - -`mir_alloc` will allocate a reference value with enough space to hold a value -of the given `MIRType`. Unlike `mir_fresh_var`, `mir_alloc` returns a -`MIRValue` instead of a `Term`. We mentioned before that `Term`s are only a -subset of `MIRValue`s, and this is one of the reasons why. Cryptol does not -have a notion of reference values, but `MIRValue`s do. As a result, you cannot -embed the result of a call to `mir_alloc` in a Cryptol expression. - -`mir_alloc` must be used with some care. Here is a first, not-quite-correct -attempt at writing a spec for `read_ref` using `mir_alloc`: - -``` -$include 3-11 code/ref-basics-fail.saw -``` - -As the comment suggests, it's not entirely clear what this spec should return. -We can't return `r`, since `read_ref` returns something of type `u32`, not -`&u32`. On the other hand, we don't have any values of type `u32` lying around -that are obviously the right thing to use here. Nevertheless, it's not required -for a SAW spec to include a `mir_return` statement, so let's see what happens -if we verify this as-is: - -``` -$ saw ref-basics-fail.saw - - - -[20:13:27.224] Loading file "ref-basics-fail.saw" -[20:13:27.227] Verifying ref_basics/54ae7b63::read_ref[0] ... -[20:13:27.235] Simulating ref_basics/54ae7b63::read_ref[0] ... -[20:13:27.235] Stack trace: -"mir_verify" (ref-basics-fail.saw:11:1-11:11) -Symbolic execution failed. -Abort due to assertion failure: - ref-basics.rs:2:5: 2:7: error: in ref_basics/54ae7b63::read_ref[0] - attempted to read empty mux tree -``` - -Clearly, SAW didn't like what we gave it. The reason this happens is although -we allocated memory for the reference `r`, we never told SAW what value should -live in that memory. When SAW simulated the `read_ref` function, it attempted -to dereference `r`, which pointed to uninitialized memory. This is constitutes -an error in SAW, which is what this "`attempted to read empty mux tree`" -business is about. - -SAW provides a `mir_points_to` command to declare what value a reference should -point to: - -``` -sawscript> :type mir_points_to -MIRValue -> MIRValue -> MIRSetup () -``` - -Here, the first `MIRValue` argument represents a reference value, and the -second `MIRValue` argument represents the value that the reference should point -to. In our spec for `read_ref`, we can declare that the reference should point -to a symbolic `u32` value like so: - -``` -$include 3-9 code/ref-basics.saw -``` - -We have renamed `r` to `r_ref` in this revised spec to more easily distinguish -it from `r_val`, which is the value that `r_ref` is declared to point to using -`mir_points_to`. In this version of the spec, it is clear that we should return -`r_val` using `mir_return`, as `r_val` is exactly the value that will be -computed by dereferencing `r_ref`. - -This pattern, where a call to `mir_alloc`/`mir_alloc_mut` to followed by a call -to `mir_points_to`, is common with function specs that involve references. -Later in the tutorial, we will see other examples of `mir_points_to` where the -reference argument does not come from `mir_alloc`/`mir_alloc_mut`. - -The argument to `read_ref` is an immutable reference, so the implementation of -the function is not allowed to modify the memory that the argument points to. -Rust also features mutable references that do permit modifying the underlying -memory, as seen in this `swap` function: - -``` {.rs} -$include 5-11 code/ref-basics.rs -``` - -A corresponding spec for `swap` is: - -``` -$include 15-28 code/ref-basics.saw -``` - -There are two interesting things worth calling out in this spec: - -1. Instead of allocating the reference values with `mir_alloc`, we instead use - `mir_alloc_mut`. This is a consequence of the fact that `&mut u32` is a - different type from `&mut` in Rust (and in MIR), and and such, we need a - separate `mir_alloc_mut` to get the types right. -2. This spec features calls to `mir_points_to` before _and_ after - `mir_execute_func`. This is because the values that `a_ref` and `b_ref` point - to before calling the function are different than the values that they point - to after calling the function. The two uses to `mir_points_to` after the - function has been called swap the order of `a_val` and `b_val`, reflecting - the fact that the `swap` function itself swaps the values that the references - point to. - -## Compound data types - -Besides integer types and reference types, Rust also features a variety of other -interesting data types. This part of the tutorial will briefly go over some of -these data types and how to interface with them in SAW. - -### Array types - -Rust includes array types where the length of the array is known ahead of time. -For instance, this `index` function takes an `arr` argument that must contain -exactly three `u32` values: - -``` {.rs} -$include 1-3 code/arrays.rs -``` - -While Rust is good at catching many classes of programmer errors at compile -time, one thing that it cannot catch in general is out-of-bounds array -accesses. In this `index` example, calling the function with a value of `idx` -ranging from `0` to `2` is fine, but any other choice of `idx` will cause the -program to crash, since the `idx` will be out of the bounds of `arr`. - -SAW is suited to checking for these sorts of out-of-bound accesses. Let's write -an incorrect spec for `index` to illustrate this: - -``` -$include 3-14 code/arrays-fail.saw -``` - -Before we run this with SAW, let's highlight some of the new concepts that this -spec uses: - -1. The type of the `arr` variable is specified using `mir_array 3 mir_u32`. - Here, the `mir_array` function takes the length of the array and the element - type as arguments, just as in Rust. -2. The spec declares the return value to be `{{ arr @ idx }}`, where `@` is - Cryptol's indexing operator. Also note that it is completely valid to embed - a MIR array type into a Cryptol expression, as Cryptol has a sequence type - that acts much like arrays do in MIR. - -As we hinted above, this spec is wrong, as it says that this should work for -_any_ possible values of `idx`. SAW will catch this mistake: - -``` -$ saw arrays-fail.saw - - - -[21:03:05.374] Loading file "arrays-fail.saw" -[21:03:05.411] Verifying arrays/47a26581::index[0] ... -[21:03:05.425] Simulating arrays/47a26581::index[0] ... -[21:03:05.426] Checking proof obligations arrays/47a26581::index[0] ... -[21:03:05.445] Subgoal failed: arrays/47a26581::index[0] index out of bounds: the length is move _4 but the index is _3 -[21:03:05.445] SolverStats {solverStatsSolvers = fromList ["SBV->Z3"], solverStatsGoalSize = 53} -[21:03:05.445] ----------Counterexample---------- -[21:03:05.445] idx: 2147483648 -[21:03:05.445] Stack trace: -"mir_verify" (arrays-fail.saw:14:1-14:11) -Proof failed. -``` - -We can repair this spec by adding some preconditions: - -``` -$include 3-12 code/arrays.saw -``` - -An alternative way of writing this spec is by using SAW's `mir_array_value` -command: - -``` -sawscript> :type mir_array_value -MIRType -> [MIRValue] -> MIRValue -``` - -Here, the `MIRType` argument represents the element type, and the list of -`MIRValue` arguments are the element values of the array. We can rewrite -`index_spec` using `mir_array_value` like so: - -``` -$include 18-30 code/arrays.saw -``` - -Here, `[arr0, arr1, arr2]` is Cryptol notation for constructing a length-3 -sequence consisting of `arr0`, `arr1`, and `arr2` as the elements. -`index_alt_spec` is equivalent to `index_spec`, albeit more verbose. For this -reason, it is usually preferable to use `mir_fresh_var` to create an entire -symbolic array rather than creating separate symbolic values for each element -and combining them with `mir_array_value`. - -There are some situations where `mir_array_value` is the only viable choice, -however. Consider this variant of the `index` function: - -``` {.rs} -$include 5-7 code/arrays.rs -``` - -When writing a SAW spec for `index_ref_arr`, we can't just create a symbolic -variable for `arr` using `mir_alloc (mir_array 3 ...)`, as the reference values -in the array wouldn't point to valid memory. Instead, we must individually -allocate the elements of `arr` using separate calls to `mir_alloc` and then -build up the array using `mir_array_value`. (As an exercise, try writing and -verifying a spec for `index_ref_arr`). - -### Tuple types - -Rust includes tuple types where the elements of the tuple can be of different -types. For example: - -``` {.rs} -$include 1-3 code/tuples.rs -``` - -SAW includes a `mir_tuple` function for specifying the type of a tuple value. -In addition, one can embed MIR tuples into Cryptol, as Cryptol also includes -tuple types whose fields can be indexed with `.0`, `.1`, etc. Here is a spec -for `flip` that makes use of all these features: - -``` -$include 3-9 code/tuples.saw -``` - -SAW also includes a `mir_tuple_value` function for constructing a tuple value -from other `MIRValue`s: - -``` -sawscript> :type mir_tuple_value -[MIRValue] -> MIRValue -``` - -`mir_tuple_value` plays a similar role for tuples as `mir_array_value` does for -arrays. - -### Struct types - -Rust supports the ability for users to define custom struct types. Structs are -uniquely identified by their names, so if you have two structs like these: - -``` {.rs} -$include 1-2 code/structs.rs -``` - -Then even though the fields of the `S` and `T` structs are the same, they are -_not_ the same struct. This is a type system feature that Cryptol does not -have, and for this reason, it is not possible to embed MIR struct values into -Cryptol. It is also not possible to use `mir_fresh_var` to create a symbolic -struct value. Instead, one can use the `mir_struct_value` command: - -``` -sawscript> :type mir_struct_value -MIRAdt -> [MIRValue] -> MIRValue -``` - -Like with `mir_array_value` and `mir_tuple_value`, the `mir_struct_value` -function takes a list of `MIRValue`s as arguments. What makes -`mir_struct_value` unique is its `MIRAdt` argument, which we have not seen up -to this point. In this context, "`Adt`" is shorthand for "[algebraic data -type](https://en.wikipedia.org/wiki/Algebraic_data_type)", and Rust's structs -are an example of ADTs. (Rust also supports enums, another type of ADT that we -will see later in this tutorial.) - -ADTs in Rust are named entities, and as such, they have unique identifiers in -the MIR JSON file in which they are defined. Looking up these identifiers can -be somewhat error-prone, so SAW offers a `mir_find_adt` command that computes -an ADT's identifier and returns the `MIRAdt` associated with it: - -``` -sawscript> :type mir_find_adt -MIRModule -> String -> [MIRType] -> MIRAdt -``` - -Here, `MIRModule` correspond to the MIR JSON file containing the ADT -definition, and the `String` is the name of the ADT whose identifier we want to -look up. The list of `MIRType`s represent types to instantiate any type -parameters to the struct (more on this in a bit). - -As an example, we can look up the `S` and `T` structs from above like so: - -``` -$include 3-6 code/structs.saw -``` - -We pass an empty list of `MIRType`s to each use of `mir_find_adt`, as neither -`S` nor `T` have any type parameters. An example of a struct that does include -type parameters can be seen here: - -``` {.rs} -$include 12-12 code/structs.rs -``` - -As mentioned before, SAW doesn't support generic definitions out of the box, so -the only way that we can make use of the `Foo` struct is by looking up a -particular instantiation of `Foo`'s type parameters. If we define a function -like this, for example: - -``` {.rs} -$include 14-16 code/structs.rs -``` - -Then this function instantiates `Foo`'s `A` type parameter with `u32` and the -`B` type parameter with `u64`. We can use `mir_find_adt` to look up this -particular instantiation of `Foo` like so: - -``` -$include 7-7 code/structs.saw -``` - -In general, a MIR JSON file can have many separate instantiations of a single -struct's type parameters, and each instantiation must be looked up separately -using `mir_find_adt`. - -Having looked up `Foo` using `mir_find_adt`, let's use the resulting -`MIRAdt` in a spec: - -``` -$include 9-18 code/structs.saw -``` - -Note that we are directly writing out the values `27` and `42` in Cryptol. -Cryptol's numeric literals can take on many different types, so in order to -disambiguate which type they should be, we give each numeric literal an -explicit type annotation. For instance, the expression `27 : [32]` means that -`27` should be a 32-bit integer. - -#### Symbolic structs - -Let's now verify a function that takes a struct value as an argument: - -``` {.rs} -$include 18-22 code/structs.rs -``` - -Moreover, let's verify this function for all possible `Bar` values. One way to -do this is to write a SAW spec that constructs a struct value whose fields are -themselves symbolic: - -``` -$include 20-38 code/structs.saw -``` - -This is a rather tedious process, however, as we had to repeatedly use -`mir_fresh_var` to create a fresh, symbolic value for each field. Moreover, -because `mit_fresh_var` does not work for structs, we had to recursively apply -this process in order to create a fresh `Foo` value. It works, but it takes a -lot of typing to accomplish. - -To make this process less tedious, SAW offers a `mir_fresh_expanded_value` -command that allows one to create symbolic values of many more types. While -`mir_fresh_var` is limited to those MIR types that can be directly converted to -Cryptol, `mir_fresh_expanded_value` can create symbolic structs by automating -the process of creating fresh values for each field. This process also applies -recursively for struct fields, such as the `Foo` field in `Bar`. - -As an example, a much shorter way to write the spec above using -`mir_fresh_expanded_value` is: - -``` -$include 42-48 code/structs.saw -``` - -That's it! Note that the string `"b"` is used as a prefix for all fresh names -that `mir_fresh_expanded_value` generates, so if SAW produces a counterexample -involving this symbolic struct value, one can expect to see names such as -`b_0`, `b_1`, etc. for the fields of the struct. - -`mir_fresh_expanded_value` makes it easier to construct large, compound values -that consist of many smaller, inner values. The drawback is that you can't -refer to these inner values in the postconditions of a spec. As a result, there -are some functions for which `mir_fresh_expanded_value` isn't suitable, so keep -this in mind before reaching for it. - -### Enum types - -Besides structs, another form of ADT that Rust supports are enums. Each enum -has a number of different _variants_ that describe the different ways that an -enum value can look like. A famous example of a Rust enum is the `Option` type, -which is defined by the standard library like so: - -``` {.rs} -enum Option { - None, - Some(T), -} -``` - -`Option` is commonly used in Rust code to represent a value that may be present -(`Some`) or absent (`None`). For this reason, we will use `Option` as our -motivating example of an enum in this section. - -First, let's start by defining some functions that make use of `Option`'s -variants: - -``` {.rs} -$include 1-7 code/enums.rs -``` - -Both functions return an `Option` value, but each function returns a -different variant. In order to tell these variants apart, we need a SAW -function which can construct an enum value that allows the user to pick which -variant they want to construct. The `mir_enum_value function does exactly that: - -``` -sawscript> :type mir_enum_value -MIRAdt -> String -> [MIRValue] -> MIRValue -``` - -Like `mir_struct_value`, `mir_enum_value` also requires a `MIRAdt` argument in -order to discern which particular enum you want. Unlike `mir_struct_value`, -however, it also requires a `String` which variant of the enum you want. In the -case of `Option`, this `String` will either be `"None"` or `"Some"`. Finally, -the `[MIRValue]` arguments represent the fields of the enum variant. - -Let's now verify some enum-related code with SAW. First, we must look up the -`Option` ADT, which works just as if you had a struct type: - -``` -$include 5-5 code/enums.saw -``` - -Next, we can use this ADT to construct enum values. We shall use -`mir_enum_value` to create a `Some` value in the spec for `i_found_something`: - -``` -$include 7-16 code/enums.saw -``` - -Note that while we used the full identifier `core::option::Option` to look up -the `Option` ADT, we do not need to use the `core::option` prefix when -specifying the `"Some"` variant. This is because SAW already knows what the -prefix should be from the `option_u32` ADT, so the `"Some"` shorthand suffices. - -Similarly, we can also write a spec for `i_got_nothing`, which uses the `None` -variant: - -``` -$include 18-25 code/enums.saw -``` - -#### Symbolic enums - -In order to create a symbolic struct, one could create symbolic fields and pack -them into a larger struct value using `mir_struct_value`. The same process is -not possible with `mir_enum_value`, however, as a symbolic enum value would -need to range over _all_ possible variants in an enum. - -Just as `mir_fresh_expanded_value` supports creating symbolic structs, -`mir_fresh_expanded_value` also supports creating symbolic enum values. For -example, given this function that accepts an `Option` value as an -argument: - -``` {.rs} -$include 9-11 code/enums.rs -``` - -We can write a spec for this function that considers all possible `Option` -values like so: - -``` -$include 27-33 code/enums.saw -``` - -Here, `o` can be a `None` value, or it can be a `Some` value with a symbolic -field. - -### Slices - -Slices are a particular type of reference that allow referencing contiguous -sequences of elements in a collection, such as an array. Unlike ordinary -references (e.g., `&u32`), SAW does not permit allocating a slice directly. -Instead, one must take a slice of an existing reference. To better illustrate -this distinction, consider this function: - -``` {.rs} -$include 1-3 code/slices.rs -``` - -`sum_of_prefix` takes a slice to a sequence of `u32`s as an argument, indexes -into the first two elements in the sequence, and adds them together. There are -many possible ways we can write a spec for this function, as the slice argument -may be backed by many different sequences. For example, the slice might be -backed by an array whose length is exactly two: - -``` {.rs} -$include 6-8 code/slices.rs -``` - -We could also make a slice whose length is longer than two: - -``` {.rs} -$include 10-12 code/slices.rs -``` - -Alternatively, the slice might be a subset of an array whose length is longer -than two: - -``` {.rs} -$include 14-16 code/slices.rs -``` - -All of these are valid ways of building the slice argument to `sum_of_prefix`. -Let's try to write SAW specifications that construct these different forms of -slices. To do so, we will need SAW functions that take a reference to a -collection (e.g., an array) and converts them into a slice reference. The -`mir_slice_value` function is one such function: - -``` -sawscript> :type mir_slice_value -MIRValue -> MIRValue -``` - -`mir_slice_value arr_ref` is the SAW equivalent of writing `arr_ref[..]`. That -is, if `arr_ref` is of type `&[T; N]`, then `mir_slice_value arr_ref` is of -type `&[T]`. Note that `arr_ref` must be a _reference_ to an array, not an -array itself. - -Let's use `mir_slice_value` to write a spec for `sum_of_prefix` when the slice -argument is backed by an array of length two: - -``` -$include 5-15 code/slices.saw -``` - -The first part of this spec allocates an array reference `a_ref` and declares -that it points to a fresh array value `a_val`. The next part declares a slice -`s` that is backed by the entirety of `a_ref`, which is then passed as an -argument to the function itself. Finally, the return value is declared to be -the sum of the first and second elements of `a_val`, which are the same values -that back the slice `s` itself. - -As noted above, the `sum_of_prefix` function can work with slices of many -different lengths. Here is a slight modification to this spec that declares it -to take a slice of length 5 rather than a slice of length 2: - -``` -$include 19-29 code/slices.saw -``` - -Both of these examples declare a slice whose length matches the length of the -underlying array. In general, there is no reason that these have to be the -same, and it is perfectly fine for a slice's length to be less than the the -length of the underlying array. In Rust, for example, we can write a slice of a -subset of an array by writing `&arr_ref[0..2]`. The SAW equivalent of this can -be achieved with the `mir_slice_range_value` function: - -``` -sawscript> :type mir_slice_range_value -MIRValue -> Int -> Int -> MIRValue -``` - -`mir_slice_range_value` takes takes two additional `Int` arguments that -represent (1) the index to start the slice from, and (2) the index at which the -slice ends. For example, `mir_slice_range_value arr_ref 0 2` creates a slice -that is backed by the first element (index `0`) and the second element (index -`1`) of `arr_ref`. Note that the range `[0..2]` is half-open, so this range -does _not_ include the third element (index `2`). - -For example, here is how to write a spec for `sum_of_prefix` where the slice is -a length-2 subset of the original array: - -``` -$include 33-43 code/slices.saw -``` - -Note that both `Int` arguments to `mir_slice_range_value` must be concrete -(i.e., not symbolic). (See the section below if you want an explanation for why -they are not allowed to be symbolic.) - -#### Aside: slices of arbitrary length - -After reading the section about slices above, one might reasonably wonder: is -there a way to write a more general spec for `sum_of_prefix`: that covers all -possible slice lengths `n`, where `n` is greater than or equal to 2? In this -case, the answer is "no". - -This is a fundamental limitation of the way SAW's symbolic execution works. The -full reason for why this is the case is somewhat technical (keep reading if you -want to learn more), but the short answer is that if SAW attempts to -simulate code whose length is bounded by a symbolic integer, then SAW will go -into an infinite loop. To avoid this pitfall, the `mir_slice_range_value` -function very deliberately requires the start and end values to be concrete -integers, as allowing these values to be symbolic would allow users to -inadvertently introduce infinite loops in their specifications. - -A longer answer as to why SAW loops forever on computations that are bounded by -symbolic lengths: due to the way SAW's symblolic execution works, it creates a -complete model of the behavior of a function for all possible inputs. The way -that SAW achieves this is by exploring all possible execution paths through a -program. If a program involves a loop, for example, then SAW will unroll all -iterations of the loop to construct a model of the loop's behavior. Similarly, -if a sequence (e.g., a slice or array) has an unspecified length, then SAW must -consider all possible lengths of the array. - -SAW's ability to completely characterize the behavior of all paths through a -function is one of its strengths, as this allows it to prove theorems that -other program verification techniques would not. This strength is also a -weakness, however. If a loop has a symbolic number of iterations, for example, -then SAW will spin forever trying to unroll the loop. Similarly, if a slice -were to have a symbolic length, then SAW would spin forever trying to simulate -the program for all possible slice lengths. - -In general, SAW cannot prevent users from writing programs whose length is -bounded by a symbolic value. For now, however, SAW removes one potential -footgun by requiring that slice values always have a concrete length. - -## Overrides and compositional verification - -Up until this point, all uses of `mir_verify` in this tutorial have provided an -empty list (`[]`) of overrides. This means that any time SAW has simulated a -function which calls another function, it will step into the definition of the -callee function and verify its behavior alongside the behavior of the callee -function. This is a fine thing to do, but it can be inefficient. For example, -consider a function like this: - -``` {.rs} -$include 5-9 code/overrides.rs -``` - -Here, the caller function `f` invokes the callee function `g` three separate -times. If we verify `f` with `mir_verify` as we have done up until this point, -then SAW must analyze the behavior of `g` three separate times. This is -wasteful, and especially so if `g` is a large and complicated function. - -This is where _compositional verification_ enters the picture. The idea behind -compositional verification is that when we prove properties of a caller -function, we can reuse properties that we have already proved about callee -functions. These properties are captured as _override specifications_, which -are also referred to by the shorthand term _overrides_. When a caller invokes a -callee with a corresponding override specification, the override's properties -are applied without needing to re-simulate the entire function. - -As it turns out, the command needed to produce an override specification is -already familiar to us—it's `mir_verify`! If you examine the type of this -command: - -``` -sawscript> :type mir_verify -MIRModule -> String -> [MIRSpec] -> Bool -> MIRSetup () -> ProofScript () -> TopLevel MIRSpec -``` - -The returned value is a `MIRSpec`, which captures the behavior of the function -that was verified as an override spec. This override can then be passed to -another call to `mir_verify` to use as part of a larger proof. - -Let's now try compositional verification in practice. To do so, we will first -prove a spec for the `g` function above. For demonstration purposes, we will -pick a simplistic implementation of `g`: - -``` {.rs} -$include 1-3 code/overrides.rs -``` - -Note that we don't really _have_ to use compositional verification when `g` is -this simple, as SAW is capable of reasoning about `g`'s behavior directly when -proving a spec for `f`. It's still worth going along with this exercise, -however, as the same principles of compositional verification apply whether the -implementation of `g` is small or large. - -The first step of compositional verification is to prove a spec for `g`, the -callee function: - -``` -$include 5-13 code/overrides.saw -``` - -There's nothing that different about this particular proof from the proofs -we've seen before. The only notable difference is that we bind the result of -calling `mir_verify` to a `MIRSpec` value that we name `g_ov` (short for "`g` -override"). This part is important, as we will need to use `g_ov` shortly. - -The next step is to write a spec for `f`. Since `g` adds `1` to its argument, -`f` will add `3` to its argument: - -``` -$include 15-21 code/overrides.saw -``` - -Again, nothing too surprising. Now let's prove `f` against `f_spec` by using -`g_ov` as a compositional override: - -``` -$include 23-23 code/overrides.saw -``` - -Here, note that instead of passing an empty list (`[]`) as we have done before, -we now pass a list containing `g_ov`. This informs `mir_verify` that whenever -it simulates a call to `g`, it should reuse the properties captured in `g_ov`. -In general, we can pass as many overrides as we want (we will see examples of -this later in the tutorial), but for now, one override will suffice. - -Let's run the proof of `f` against `f_spec`, making sure to pay attention to -the output of SAW: - -``` -[19:06:17.392] Verifying overrides/96c5af24::f[0] ... -[19:06:17.406] Simulating overrides/96c5af24::f[0] ... -[19:06:17.407] Matching 1 overrides of overrides/96c5af24::g[0] ... -[19:06:17.407] Branching on 1 override variants of overrides/96c5af24::g[0] ... -[19:06:17.407] Applied override! overrides/96c5af24::g[0] -[19:06:17.407] Matching 1 overrides of overrides/96c5af24::g[0] ... -[19:06:17.407] Branching on 1 override variants of overrides/96c5af24::g[0] ... -[19:06:17.407] Applied override! overrides/96c5af24::g[0] -[19:06:17.407] Matching 1 overrides of overrides/96c5af24::g[0] ... -[19:06:17.407] Branching on 1 override variants of overrides/96c5af24::g[0] ... -[19:06:17.407] Applied override! overrides/96c5af24::g[0] -[19:06:17.407] Checking proof obligations overrides/96c5af24::f[0] ... -[19:06:17.422] Proof succeeded! overrides/96c5af24::f[0] -``` - -We've now proven `f` compositionally! The first two lines ("`Verifying ...`" -and "`Simulating ...`") and the last two lines ("`Checking proof obligations -...`" and "`Proof succeeded! ..."`) are the same as before, but this time, we -have some additional lines of output in between: - -* Whenever SAW prints "`Matching overrides of `", that's when you - know that SAW is about to simulate a call to ``. At that point, SAW - will check to see how many overrides (``) for `` are available. -* Whenever SAW prints "`Brancing on override variants of ", SAW is - trying to figure out which of the `` overrides to apply. In this example, - there is only a single override, so the choice is easy. In cases where there - are multiple overrides, however, SAW may have to work harder (possibly even - consulting an SMT solver) to figure out which override to use. -* If SAW successfully picks an override to apply, it will print - "`Applied override! ...`". - -In the example above, we used a single `g` override that applies for all -possible arguments. In general, however, there is no requirement that overrides -must work for all arguments. In fact, it is quite common for SAW verification -efforts to write different specifications for the same function, but with -different arguments. We can then provide multiple overrides for the same -function as part of a compositional verification, and SAW will be able to pick -the right override depending on the shape of the argument when invoking the -function being overridden. - -For example, let's suppose that we wrote different `g` specs, one where the -argument to `g` is even, and another where the argument to `g` is odd: - -``` -$include 25-44 code/overrides.saw -``` - -We can then prove `f` compositionally by passing both of the `g` overrides to -`mir_verify`: - -``` -$include 45-45 code/overrides.saw -``` - -Like before, this will successfully verify. The only different now is that SAW -will print output involving two overrides instead of just one: - -``` -[20:48:07.649] Simulating overrides/96c5af24::f[0] ... -[20:48:07.650] Matching 2 overrides of overrides/96c5af24::g[0] ... -[20:48:07.650] Branching on 2 override variants of overrides/96c5af24::g[0] ... -[20:48:07.652] Applied override! overrides/96c5af24::g[0] -... -``` - -Keep in mind that if you provide at least one override for a function as part -of a compositional verification, then SAW _must_ apply an override whenever it -invokes that function during simulation. If SAW cannot find a matching -override, then the verification will fail. For instance, consider what would -happen if you tried proving `f` like so: - -``` -$include 33-33 code/overrides-fail.saw -``` - -This time, we supply one override for `g` that only matches when the argument -is even. This is a problem, as SAW will not be able to find a matching override -when the argument is odd. Indeed, SAW will fail to verify this: - -``` -[20:53:29.588] Verifying overrides/96c5af24::f[0] ... -[20:53:29.602] Simulating overrides/96c5af24::f[0] ... -[20:53:29.602] Matching 1 overrides of overrides/96c5af24::g[0] ... -[20:53:29.602] Branching on 1 override variants of overrides/96c5af24::g[0] ... -[20:53:29.603] Applied override! overrides/96c5af24::g[0] -[20:53:29.603] Matching 1 overrides of overrides/96c5af24::g[0] ... -[20:53:29.603] Branching on 1 override variants of overrides/96c5af24::g[0] ... -[20:53:29.604] Applied override! overrides/96c5af24::g[0] -[20:53:29.604] Matching 1 overrides of overrides/96c5af24::g[0] ... -[20:53:29.604] Branching on 1 override variants of overrides/96c5af24::g[0] ... -[20:53:29.605] Applied override! overrides/96c5af24::g[0] -[20:53:29.605] Symbolic simulation completed with side conditions. -[20:53:29.606] Checking proof obligations overrides/96c5af24::f[0] ... -[20:53:29.623] Subgoal failed: overrides/96c5af24::f[0] No override specification applies for overrides/96c5af24::g[0]. -Arguments: -- c@26:bv -Run SAW with --sim-verbose=3 to see a description of each override. -[20:53:29.623] SolverStats {solverStatsSolvers = fromList ["SBV->Z3"], solverStatsGoalSize = 388} -[20:53:29.624] ----------Counterexample---------- -[20:53:29.624] x: 1 -... -Proof failed. -``` - -Here, we can see that `No override specification applies`, and SAW also -generates a counterexample of `x: 1`. Sure enough, `1` is an odd number! - -### Overrides and mutable references - -Compositional overrides provide great power, as they effectively allow you to -skip over certain functions when simulating them and replace them with simpler -implementations. With great power comes great responsibility, however. In -particular, one must be careful when using overrides for functions that modify -mutable references. If an override does not properly capture the behavior of a -mutable reference, it could potentially lead to incorrect proofs. - -This is the sort of thing that is best explained with an example, so consider -these two functions: - -``` {.rs} -$include 1-9 code/overrides-mut.rs -``` - -The `side_effect` function does not return anything interesting; it is only -ever invoked to perform a side effect of changing the mutable reference `a` to -point to `0`. The `foo` function invokes `side_effect`, and as a result, it -will always return `0`, regardless of what the argument to `foo` is. No -surprises just yet. - -Now let's make a first attempt at verifying `foo` using compositional -verification. First, we will write a spec for `side_effect`: - -``` -$include 5-10 code/overrides-mut-fail.saw -``` - -`side_effect_spec` is somewhat odd. Although it goes through the effort of -allocating a mutable reference `a_ref` and initializing it, nothing about this -spec states that `a_ref` will point to `0` after the function has been invoked. -This omission is strange, but not outright wrong—the spec just underspecifies -what the behavior of the function is. Indeed, SAW will successfully verify this -spec using `mir_verify`: - -``` -$include 18-18 code/overrides-mut-fail.saw -``` - -Next, let's try to write a spec for `foo`: - -``` -$include 12-16 code/overrides-mut-fail.saw -``` - -At this point, alarm bells should be going off in your head. This spec -incorrectly states that `foo(x)` should return `x`, but it should actually -return `0`! This looks wrong, but consider what would happen if you tried to -verify this compositionally using our `side_effect_ov` override: - -``` -$include 19-19 code/overrides-mut-fail.saw -``` - -If SAW were to simulate `foo(x)`, it would invoke create a temporary variable -`b` and assign it to the value `x`, and then it would invoke `side_effect(&mut -b)`. At this point, the `side_effect_ov` override would apply. According to -`side_effect_spec`, the argument to `side_effect` is not modified at all after -the function returns. This means that when the `foo` function returns `b`, it -will still retain its initial value of `x`. This shows that if we were to use -`side_effect_ov`, we could prove something that's blatantly false! - -Now that we've made you sweat a little bit, it's time for some good news: SAW -won't _actually_ let you prove `foo_spec`. If you try this compositional proof -in practice, SAW will catch your mistake: - -``` -[14:50:29.170] Verifying overrides_mut/11e47708::foo[0] ... -[14:50:29.181] Simulating overrides_mut/11e47708::foo[0] ... -[14:50:29.181] Matching 1 overrides of overrides_mut/11e47708::side_effect[0] ... -[14:50:29.181] Branching on 1 override variants of overrides_mut/11e47708::side_effect[0] ... -... -State of memory allocated in precondition (at overrides-mut-fail.saw:6:12) not described in postcondition -``` - -The line of code that SAW points to in the "`State of memory ...`" error -message is: - -``` -$include 6-6 code/overrides-mut-fail.saw -``` - -SAW informs us that although we allocated the mutable reference `a_ref`, we -never indicated what it should point to after the function has returned. This -is an acceptable (if somewhat unusual) thing to do when verifying -`side_effect_spec` using `mir_verify`, but it is _not_ acceptable to do this -when using this spec as an override. To avoid unsound behavior like what is -described above, any override that allocates a mutable reference in its -preconditions _must_ declare what its value should be in the postconditions, no -exceptions. - -Thankfully, repairing this spec is relatively straightforward. Simply add a -`mir_points_to` statement in the postconditions of `side_effect_spec`: - -``` -$include 5-13 code/overrides-mut.saw -``` - -Then use the correct return value in `foo_spec`: - -``` -$include 15-21 code/overrides-mut.saw -``` - -And now the compositional proof of `foo_spec` works! - -### Unsafe overrides - -Now that we've made it this far into the tutorial, it's time to teach you a -more advanced technique: _unsafe_ overrides. Up until this point, we have -relied on SAW to check all of our work, and this is usually what you'd want -from a formal verification tool. In certain circumstances, however, it can be -useful to say "I know what I'm doing, SAW—just believe me when I say this spec -is valid!" In order to say this, you can use `mir_unsafe_assume_spec`: - -``` -sawscript> :type mir_unsafe_assume_spec -MIRModule -> String -> MIRSetup () -> TopLevel MIRSpec -``` - -`mir_unsafe_assume_spec` is `mir_verify`'s cousin who likes to live a little -more dangerously. Unlike `mir_verify`, the specification that you pass to -`mir_unsafe_assume_spec` (the `MIRSetup ()` argument) is _not_ checked for full -correctness. That is, `mir_unsafe_assume_spec` will bypass SAW's usual symbolic -execution pipeline, which is why one does not need to pass a `ProofScript` -argument (e.g., `z3`) to `mir_unsafe_assume_spec`. SAW will believe whatever -spec you supply `mir_unsafe_assume_spec` to be valid, and the `MIRSpec` that -`mir_unsafe_assume_spec` returns can then be used in later compositional -verifications. - -Why would you want to do this? The main reason is that writing proofs can be -difficult, and sometimes, there are certain functions in a SAW verification -effort that are disproportionately harder to write a spec for than others. It -is tempting to write specs for each function in sequence, but this can run the -risk of getting stuck on a particularly hard-to-verify function, blocking -progress on other parts of the proofs. - -In these situations, `mir_unsafe_assume_spec` can be a useful prototyping tool. -One can use `mir_unsafe_assume_spec` to assume a spec for the hard-to-verify -function and then proceed with the remaining parts of the proof. Of course, you -should make an effort to go back and prove the hard-to-verify function's spec -later, but it can be nice to try something else first. - -For example, here is how one can unsafely assume `g_spec` and use it in a -compositional proof of `f_spec`: - -``` -$include 21-22 code/overrides-unsafe.saw -``` - -It should be emphasized that when we say "`unsafe`", we really mean it. -`mir_unsafe_assume_spec` can be used to prove specs that are blatantly wrong, -so use it with caution. - -## Static items - -Sometimes, Rust code makes use of [_static -items_](https://doc.rust-lang.org/reference/items/static-items.html), which are -definitions that are defined in a precise memory location for the entire -duration of the program. As such, static items can be thought of as a form of -global variables. - -### Immutable static items - -There are two kinds of static items in Rust: mutable static items (which have a -`mut` keyword) and immutable static items (which lack `mut`). Immutable static -items are much easier to deal with, so let's start by looking at an example of -a program that uses immutable static data: - -``` {.rs} -$include 1-5 code/statics.rs -``` - -This function will return `ANSWER`, i.e., `42`. We can write a spec that says -as much: - -``` -$include 5-9 code/statics.saw -``` - -This works, but it is somewhat unsatisfying, as it requires hard-coding the -value of `ANSWER` into the spec. Ideally, we'd not have to think about the -precise implementation of static items like `ANSWER`. Fortunately, SAW makes -this possible by providing a `mir_static_initializer` function which computes -the initial value of a static item at the start of the program: - -``` -sawscript> :type mir_static_initializer -String -> MIRValue -``` - -In this case, `mir_static_initializer "statics::ANSWER"` is equivalent to -writing `mir_term {{ 42 : [32] }}`, so this spec is also valid: - -``` -$include 14-18 code/statics.saw -``` - -Like `mir_verify`, the `mir_static_initializer` function expects a full -identifier as an argument, so we must write `"statics::ANSWER"` instead of -just `"ANSWER". - -At the MIR level, there is a unique reference to every static item. You can -obtain this reference by using the `mir_static` function: - -``` -sawscript> :type mir_static -String -> MIRValue -``` - -Here is one situation in which you would need to use a _reference_ to a static -item (which `mir_static` computes) rather than the _value_ of a static item -(which `mir_static_initializer` computes): - -``` {.rs} -$include 7-9 code/statics.rs -``` - -A spec for this function would look like this: - -``` -$include 23-27 code/statics.saw -``` - -That's about all there is to say regarding immutable static items. - -### Mutable static items - -Mutable static items are a particularly tricky aspect of Rust. Including a -mutable static item in your program is tantamount to having mutable global -state that any function can access and modify. They are so tricky, in fact, -that Rust does not even allow you to use them unless you surround them in an -`unsafe` block: - -``` {.rs} -$include 11-15 code/statics.rs -``` - -The `mir_static_initializer` and `mut_static` functions work both immutable and -mutable static items, so we can write specs for mutable items using mostly the -same techniques as for writing specs for immutable items. We must be careful, -however, as SAW is pickier when it comes to specifying the initial values of -mutable static items. For example, here is naïve attempt at porting the spec -for `answer_to_the_ultimate_question` over to its mutable static counterpart, -`mut_answer_to_the_ultimate_question`: - -``` -$include 5-9 code/statics-fail.saw -``` - -This looks plausible, but SAW will fail to verify it: - -``` -[21:52:32.738] Verifying statics/28a97e47::mut_answer_to_the_ultimate_question[0] ... -[21:52:32.745] Simulating statics/28a97e47::mut_answer_to_the_ultimate_question[0] ... -... -Symbolic execution failed. -Abort due to assertion failure: - statics.rs:14:14: 14:24: error: in statics/28a97e47::mut_answer_to_the_ultimate_question[0] - attempted to read empty mux tree -``` - -Oh no! Recall that we have seen this "`attempted to read empty mux tree`" error -message once before when discussing reference types. This error arose when we -attempted to read from uninitialized memory from a reference value. The same -situation applies here. A static item is backed by a reference, and SAW -deliberately does _not_ initialize the memory that a mutable static reference -points to upon program startup. Since we did not initialize `MUT_ANSWER`'s -reference value in the preconditions of the spec, SAW crashes at simulation -time when it attempts to read from the uninitialized memory. - -The solution to this problem is to perform this initialization explicitly using -`mir_points_to` in the preconditions of the spec. For example, this is a valid -spec: - -``` -$include 32-40 code/statics-fail.saw -``` - -We don't necessarily have to use `mir_static_initializer` as the starting value -for `MUT_ANSWER`, however. This spec, which uses `27` as the starting value, is -equally valid: - -``` -$include 45-52 code/statics-fail.saw -``` - -At this point, you are likely wondering: why do we need to explicitly -initialize mutable static references but not immutable static references? After -all, when we wrote a spec for `answer_to_the_ultimate_question` earlier, we -managed to get away with not initializing the reference for `ANSWER` (which is -an immutable static item). The difference is that the value of a mutable static -item can change over the course of a program, and SAW requires that you be very -careful in specifying what a mutable static value is at the start of a -function. For example, consider a slightly extended version of the earlier Rust -code we saw: - -``` {.rs} -$include 11-22 code/statics.rs -``` - -Suppose someone were to ask you "what value does -`mut_answer_to_the_ultimate_question` return?" This is not a straightforward -question to answer, as the value that it returns depends on the surrounding -context. If you were to call `mut_answer_to_the_ultimate_question` right as the -program started, it would return `42`. If you were to call -`mut_answer_to_the_ultimate_question` as part of the implementation of -`alternate_universe`, however, then it would return `27`! This is an inherent -danger of using mutable static items, as they can be modified at any time by -any function. For this reason, SAW requires you to be explicit about what the -initial values of mutable static items should be. - -#### Mutable static items and compositional overrides - -In the "Overrides and mutable references" section, we discussed the potential -pitfalls of using mutable references in compositional overrides. Mutable static -items are also mutable values that are backed by references, and as such, they -are also subject to the same pitfalls. Let's see an example of this: - -``` {.rs} -$include 1-12 code/statics-compositional.rs -``` - -The setup is almost the same, except that instead of passing a mutable -reference as an argument to `side_effect`, we instead declare a mutable static -item `A` that is shared between `side_effect` and `foo`. We could potentially -write SAW specs for `side_effect` and `foo` like these: - -``` -$include 5-20 code/statics-compositional-fail.saw -``` - -Note that we have once again underspecified the behavior of `side_effect`, as -we do not say what `A`'s value should be in the postconditions of -`side_effect_spec`. Similarly, `foo_spec` is wrong, as it should return `0` -rather than the initial value of `A`. By similar reasoning as before, we run -the risk that using `side_effect_ov` could lead use to prove something -incorrect. Thankfully, SAW can also catch this sort of mistake: - -``` -[15:46:38.525] Verifying statics_compositional/16fea9c0::side_effect[0] ... -[15:46:38.533] Simulating statics_compositional/16fea9c0::side_effect[0] ... -[15:46:38.533] Checking proof obligations statics_compositional/16fea9c0::side_effect[0] ... -[15:46:38.533] Proof succeeded! statics_compositional/16fea9c0::side_effect[0] -[15:46:38.533] Verifying statics_compositional/16fea9c0::foo[0] ... -[15:46:38.542] Simulating statics_compositional/16fea9c0::foo[0] ... -[15:46:38.542] Matching 1 overrides of statics_compositional/16fea9c0::side_effect[0] ... -[15:46:38.542] Branching on 1 override variants of statics_compositional/16fea9c0::side_effect[0] ... -... -State of mutable static variable "statics_compositional/16fea9c0::A[0]" not described in postcondition -``` - -To repair this proof, add a `mir_points_to` statement in the postconditions of -`side_effect_spec`: - -``` -$include 7-13 code/statics-compositional.saw -``` - -And then correct the behavior of `foo_spec`: - -``` -$include 15-22 code/statics-compositional.saw -``` - -Be warned that if your program declares any mutable static items, then any -compositional override _must_ state what the value of each mutable static item -is in its postconditions. This applies _even if the override does not directly -use the mutable static items_. For example, if we had declared a second mutable -static item alongside `A`: - -``` {.rs} -static mut A: u32 = 42; -static mut B: u32 = 27; -``` - -Then `side_effect_spec` would need an additional `mir_points_to` statement -involving `B` to satisfy this requirement. This requirement is somewhat -heavy-handed, but it is necessary in general to avoid unsoundness. Think -carefully before you use mutable static items! - -## Case study: Salsa20 - -If you've made it this far into the tutorial, congrats! You've now been exposed -to all of the SAW fundamentals that you need to verify Rust code found in the -wild. Of course, talking about verifying real-world code is one thing, but -actually _doing_ the verification is another thing entirely. Making the jump -from the small examples to "industrial-strength" code can be intimidating. - -To make this jump somewhat less frightening, the last part of this tutorial -will consist of a case study in using SAW to verify a non-trivial piece of Rust -code. In particular, we will be looking at a Rust implementation of the -[Salsa20](https://en.wikipedia.org/wiki/Salsa20) stream cipher. We do not -assume any prior expertise in cryptography or stream ciphers in this tutorial, -so don't worry if you are not familiar with Salsa20. - -More than anything, this case study is meant to emphasize that verification is -an iterative process. It's not uncommon to try something with SAW and encounter -an error message. That's OK! We will explain what can go wrong when verifying -Salsa20 and how to recover from these mistakes. Later, if you encounter similar -issues when verifying your own code with SAW, the experience you have developed -when developing these proofs can inform you of possible ways to fix the issues. - -### The `salsa20` crate - -The code for this Salsa20 implementation we will be verifying can be found -under the -[`code/salsa20`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20) -subdirectory. This code is adapted from version 0.3.0 of the `salsa20` crate, -which is a part of the -[`stream-ciphers`](https://github.com/RustCrypto/stream-ciphers) project. The -code implements Salsa20 as well as variants such as HSalsa20 and XSalsa20, but -we will only be focusing on the original Salsa20 cipher in this tutorial. - -The parts of the crate that are relevant for our needs are mostly contained in -the -[`src/core.rs`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/src/core.rs) -file, as well as some auxiliary definitions in the -[`src/rounds.rs`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/src/rounds.rs) -and -[`src/lib.rs`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/src/lib.rs) -files. You can take a look at these files if you'd like, but you don't need to -understand everything in them just yet. We will introduce the relevant parts of -the code in the tutorial as they come up. - -### Salsa20 preliminaries - -Salsa20 is a stream cipher, which is a cryptographic technique for encrypting -and decrypting messages. A stream cipher encrypts a message by combining it -with a _keystream_ to produce a ciphertext (the encrypted message). Moreover, -the same keystream can then be combined with the ciphertext to decrypt it back -into the original message. - -The original author of Salsa20 has published a specification for Salsa20 -[here](https://cr.yp.to/snuffle/spec.pdf). This is a great starting point for a -formal verification project, as this gives us a high-level description of -Salsa20's behavior that will guide us in proving the functional correctness of -the `salsa20` crate. When we say that `salsa20` is functionally correct, we -really mean "proven correct with respect to the Salsa20 specification". - -The first step in our project would be to port the Salsa20 spec to Cryptol -code, as we will need to use this code when writing SAW proofs. The process of -transcribing an English-language specification to executable Cryptol code is -interesting in its own right, but it is not the primary focus of this tutorial. -As such, we will save you some time by providing a pre-baked Cryptol -implementation of the Salsa20 spec -[here](https://github.com/GaloisInc/saw-script/blob/master/doc/rust-tutorial/code/salsa20/Salsa20.cry). -(This implementation is -[adapted](https://github.com/GaloisInc/cryptol-specs/blob/1366ccf71db9dca58b16ff04ca7d960a4fe20e34/Primitive/Symmetric/Cipher/Stream/Salsa20.cry) -from the [`cryptol-specs`](https://github.com/GaloisInc/cryptol-specs) repo.) - -Writing the Cryptol version of the spec is only half the battle, however. We -still have to prove that the Rust implementation in the `salsa20` crate adheres -to the behavior prescribed by the spec, which is where SAW enters the picture. -As we will see shortly, the code in `salsa20` is not a direct port of the -pseudocode shown in the Salsa20 spec, as it is somewhat more low-level. SAW's -role is to provide us assurance that the behavior of the low-level Rust code -and the high-level Cryptol code coincide. - -### A note about cryptographic security - -As noted in the previous section, our goal is to prove that the behavior of -`salsa20` functions is functionally correct. This property should _not_ be -confused with cryptographic security. While functional correctness is an -important aspect of cryptographic security, a full cryptographic security audit -would encompass additional properties such as whether the code runs in constant -time on modern CPUs. As such, the SAW proofs we will write would not constitute -a full security audit (and indeed, the [`salsa20` -`README`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/README.md) -states that the crate has never received such an audit). - -### An overview of the `salsa20` code - -Before diving into proofs, it will be helpful to have a basic understanding of -the functions and data types used in the `salsa20` crate. Most of the -interesting code lives in -[`src/core.rs`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/src/core.rs). -At the top of this file, we have the `Core` struct: - -``` {.rs} -$include 8-14 code/salsa20/src/core.rs -``` - -Let's walk through this: - -* The `state` field is an array that is `STATE_WORDS` elements long, where - `STATE_WORDS` is a commonly used alias for `16`: - - ``` {.rs} - $include 88-89 code/salsa20/src/lib.rs - ``` - -* The `rounds` field is of type `PhantomData`. If you haven't seen it - before, - [`PhantomData`](https://doc.rust-lang.org/std/marker/struct.PhantomData.html) - is a special type that tells the Rust compiler to pretend as though the - struct is storing something of type `R`, even though a `PhantomData` value - will not take up any space at runtime. - -The reason that `Core` needs a `PhantomData` field is because `R` -implements the `Rounds` trait: - -``` {.rs} -$include 1-5 code/salsa20/src/rounds.rs -``` - -A core operation in Salsa20 is hashing its input through a series of -_rounds_. The `COUNT` constant indicates how many rounds should be performed. -The Salsa20 spec assumes 20 rounds: - -``` {.rs} -$include 23-29 code/salsa20/src/rounds.rs -``` - -However, there are also reduced-round variants that perform 8 and 12 rounds, -respectively: - -``` {.rs} -$include 7-21 code/salsa20/src/rounds.rs -``` - -Each number of rounds has a corresponding struct whose names begins with the -letter `R`. For instance, a `Core` value represents a 20-round Salsa20 -cipher. Here is the typical use case for a `Core` value: - -* A `Core` value is created using the `new` function: - - ``` {.rs} - $include 18-18 code/salsa20/src/core.rs - ``` - - We'll omit the implementation for now. This function takes a secret `Key` - value and a unique `Nonce` value and uses them to produce the initial `state` - in the `Core` value. - -* After creating a `Core` value, the `counter_setup` and `rounds` functions are - used to produce the Salsa20 keystream: - - ``` {.rs} - $include 83-83 code/salsa20/src/core.rs - ``` - ``` {.rs} - $include 90-90 code/salsa20/src/core.rs - ``` - - We'll have more to say about these functions later. - -* The _pièce de résistance_ is the `apply_keystream` function. This takes a - newly created `Core` value, produces its keystream, and applies it to a - message to produce the `output`: - - ``` {.rs} - $include 68-68 code/salsa20/src/core.rs - ``` - -Our ultimate goal is to verify the `apply_keystream` function, which is the -Rust equivalent of the Salsa20 encryption function described in the spec. - -### Building `salsa20` - -The next step is to build the `salsa20` crate. Unlike the examples we have seen -up to this point, which have been self-contained Rust files, `salsa20` is a -`cargo`-based project. As such, we will need to build it using `cargo -saw-build`, an extension to the `cargo` package manager that integrates with -`mir-json`. Before you proceed, make sure that you have defined the -`SAW_RUST_LIBRARY_PATH` environment variable as described in [this -section](#the-saw_rust_library_path-environment-variable). - -To build the `salsa20` crate, perform the following steps: - -``` -$ cd code/salsa20/ -$ cargo saw-build -``` - -Near the end of the build output, you will see a line that looks like this: - -``` -linking 9 mir files into <...>/saw-script/doc/rust-tutorial/code/salsa20/target/x86_64-unknown-linux-gnu/debug/deps/salsa20-dd0d90f28492b9cb.linked-mir.json -``` - -This is the location of the MIR JSON file that we will need to provide to SAW. -(When we tried it, the hash in the file name was `dd0d90f28492b9cb`, but it -could very well be different on your machine.) Due to how `cargo` works, the -location of this file is in a rather verbose, hard-to-remember location. For -this reason, we recommend copying this file to a different path, e.g., - -``` -$ cp <...>/saw-script/doc/rust-tutorial/code/salsa20/target/x86_64-unknown-linux-gnu/debug/deps/salsa20-dd0d90f28492b9cb.linked-mir.json code/salsa20/salsa20.linked-mir.json -``` - -As a safeguard, we have also checked in a compressed version of this MIR JSON -file as -[`code/salsa20/salsa/salsa20.linked-mir.json.tar.gz`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/salsa20.linked-mir.json.tar.gz). -In a pinch, you can extract this archive to obtain a copy of the MIR JSON file, -which is approximately 4.6 megabytes when uncompressed. - -### Getting started with SAW - -Now that we've built the `salsa20` crate, it's time to start writing some -proofs! Let's start a new `code/salsa20/salsa20.saw` file as fill it in with -the usual preamble: - -``` -$include 1-3 code/salsa20/salsa20-reference.saw -``` - -We are also going to need to make use of the Cryptol implementation of the -Salsa20 spec, which is defined in -[`code/salsa20/Salsa20.cry`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/Salsa20.cry). -SAW allows you to import standalone Cryptol `.cry` files by using the `import` -command: - -``` -$include 4-4 code/salsa20/salsa20-reference.saw -``` - -As an aside, note that we have also checked in a -[`code/salsa20/salsa20-reference.saw`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/salsa20-reference.saw), -which contains a complete SAW file. We encourage you _not_ to look at this file -for now, since following along with the tutorial is meant to illustrate the -"a-ha moments" that one would have in the process of writing the proofs. In you -become stuck while following along and absolutely need a hint, however, then -this file can help you become unstuck. - -### Verifying our first `salsa20` function - -Now it's time to start verifying some `salsa20` code. But where do we start? -It's tempting to start with `apply_keystream`, which is our end goal. This is -likely going to be counter-productive, however, as `apply_keystream` is a -large function with several moving parts. Throwing SAW at it immediately is -likely to cause it to spin forever without making any discernible progress. - -For this reason, we will instead take the approach of working from the -bottom-up. That is, we will first verify the functions that `apply_keystream` -transitively invokes, and then leverage compositional verification to verify a -proof of `apply_keystream` using overrides. This approach naturally breaks up -the problem into smaller pieces that are easier to understand in isolation. - -If we look at the implementation of `apply_keystream`, we see that it invokes -the `round` function, which in turn invokes the `quarter_round` function: - -``` {.rs} -$include 122-142 code/salsa20/src/core.rs -``` - -`quarter_round` is built on top of the standard library functions -[`wrapping_add`](https://doc.rust-lang.org/std/primitive.usize.html#method.wrapping_add) -and -[`rotate_left`](https://doc.rust-lang.org/std/primitive.usize.html#method.rotate_left), -so we have finally reached the bottom of the call stack. This makes -`quarter_round` a good choice for the first function to verify. - -The implementation of the Rust `quarter_round` function is quite similar to the -Cryptol `quarterround` function in `Salsa20.cry`: - -``` -$include 10-16 code/salsa20/Salsa20.cry -``` - -The Cryptol `quarterround` function doesn't have anything like the `state` -argument in the Rust `quarter_round` function, but let's not fret about that -too much yet. Our SAW spec is going to involve `quarterround` _somehow_—we just -have to figure out how to make it fit. - -Let's start filling out the SAW spec for `quarter_round`: - -``` -$include 6-6 code/salsa20/salsa20-quarter_round-fail1.saw -``` - -We are going to need some fresh variables for the `a`, `b`, `c`, and `d` -arguments: - -``` -$include 7-10 code/salsa20/salsa20-quarter_round-fail1.saw -``` - -We will also need to allocate a reference for the `state` argument. The -reference's underlying type is `STATE_WORDS` (`16`) elements long: - -``` -$include 11-13 code/salsa20/salsa20-quarter_round-fail1.saw -``` - -Finally, we will need to pass these arguments to the function: - -``` -$include 15-21 code/salsa20/salsa20-quarter_round-fail1.saw -``` - -With that, we have a spec for `quarter_round`! It's not very interesting just -yet, as we don't specify what `state_ref` should point to after the function -has returned. But that's fine for now. When developing a SAW proof, it can be -helpful to first write out the "skeleton" of a function spec that only contains -the call to `mir_execute_func`, without any additional preconditions or -postconditions. We can add those later after ensuring that the skeleton works -as expected. - -Let's check our progress thus far by running this through SAW: - -``` -$ saw salsa20.saw -... -[23:16:05.080] Type errors: - salsa20.saw:12:39-12:68: Unbound variable: "STATE_WORDS" (salsa20.saw:12:49-12:60) -Note that some built-in commands are available only after running -either `enable_deprecated` or `enable_experimental`. - - salsa20/salsa20.saw:11:31-11:60: Unbound variable: "STATE_WORDS" (salsa20.saw:11:41-11:52) -Note that some built-in commands are available only after running -either `enable_deprecated` or `enable_experimental`. -``` - -We've already run into some type errors. Not too surprising, considering this -was our first attempt. The error message contains that `STATE_WORDS` is -unbound. This makes sense if you think about it, as `STATE_WORDS` is defined in -the Rust code, but not in the SAW file itself. Let's fix that by adding this -line to `salsa20.saw`: - -``` -$include 6-6 code/salsa20/salsa20-quarter_round-fail2.saw -``` - -That change fixes the type errors in `quarter_round_spec`. Hooray! Let's press -on. - -Next, we need to add a call to `mir_verify`. In order to do this, we need to -know what the full identifier for the `quarter_round` function is. Because it -is defined in the `salsa20` crate and in the `core.rs` file, so we would expect -the identifier to be named `salsa20::core::quarter_round`: - -``` -$include 25-26 code/salsa20/salsa20-quarter_round-fail2.saw -``` - -However, SAW disagrees: - -``` -[00:22:56.970] Stack trace: -"mir_verify" (salsa20.saw:26:3-26:13) -Couldn't find MIR function named: salsa20::core::quarter_round -``` - -Ugh. This is a consequence of how `mir-json` disambiguates identifiers. Because -there is a separate `core` crate in the Rust standard libraries, `mir-json` -uses "`core#1`", a distinct name, to refer to the `core.rs` file. You can see -this for yourself by digging around in the MIR JSON file, if you'd like. (In a -future version of SAW, one will be able to [look this name -up](https://github.com/GaloisInc/saw-script/issues/1980) more easily.) - -Once we change the identifier: - -``` -$include 25-26 code/salsa20/salsa20-quarter_round-fail3.saw -``` - -We can run SAW once more. This time, SAW complains about a different thing: - -``` -[01:00:19.697] Verifying salsa20/10e438b3::core#1[0]::quarter_round[0] ... -[01:00:19.714] Simulating salsa20/10e438b3::core#1[0]::quarter_round[0] ... -[01:00:19.717] Checking proof obligations salsa20/10e438b3::core#1[0]::quarter_round[0] ... -[01:00:19.739] Subgoal failed: salsa20/10e438b3::core#1[0]::quarter_round[0] index out of bounds: the length is move _10 but the index is _9 -[01:00:19.739] SolverStats {solverStatsSolvers = fromList ["SBV->Z3"], solverStatsGoalSize = 53} -[01:00:19.739] ----------Counterexample---------- -[01:00:19.739] a: 2147483648 -``` - -Here, SAW complains that we have an `index out of bounds`. Recall that we are -indexing into the `state` array, which is of length 16, using the -`a`/`b`/`c`/`d` arguments. Each of these arguments are of type `usize`, and -because we are declaring these to be symbolic, it is quite possible for each -argument to be 16 or greater, which would cause the index into `state` to be -out of bounds. - -In practice, however, the only values of `a`/`b`/`c`/`d` that we will use are -less than 16. We can express this fact as a precondition: - -``` -$include 13-16 code/salsa20/salsa20-quarter_round-fail4.saw -``` - -That is enough to finally get SAW to verify this very stripped-down version of -`quarter_round_spec`. Some good progress! But we aren't done yet, as we don't -yet say what happens to the value that `state` points to after the function -returns. This will be a requirement if we want to use `quarter_round_spec` in -compositional verification (and we do want this), so we should address this -shortly. - -Recall that unlike the Rust `quarter_round` function, the Cryptol -`quarterround` function doesn't have a `state` argument. This is because the -Rust function does slightly more than what the Cryptol function does. The Rust -function will look up elements of the `state` array, use them to perform the -computations that the Cryptol function does, and then insert the new values -back into the `state` array. To put it another way: the Rust function can be -thought of as a wrapper around the Cryptol function that also performs an -in-place bulk update of the `state` array. - -In Cryptol, one can look up elements of an array using the `(@@)` function, -and one can perform in-place array updates using the `updates` function. -This translates into a postcondition that looks like this: - -``` -$include 28-30 code/salsa20/salsa20-quarter_round-fail4.saw -``` - -What does SAW think of this? Someone surprisingly, SAW finds a counterexample: - -``` -[01:43:30.065] Verifying salsa20/10e438b3::core#1[0]::quarter_round[0] ... -[01:43:30.078] Simulating salsa20/10e438b3::core#1[0]::quarter_round[0] ... -[01:43:30.084] Checking proof obligations salsa20/10e438b3::core#1[0]::quarter_round[0] ... -[01:43:30.801] Subgoal failed: salsa20/10e438b3::core#1[0]::quarter_round[0] Literal equality postcondition - -[01:43:30.801] SolverStats {solverStatsSolvers = fromList ["SBV->Z3"], solverStatsGoalSize = 1999} -[01:43:30.802] ----------Counterexample---------- -[01:43:30.802] a: 13 -[01:43:30.802] b: 3 -[01:43:30.802] c: 0 -[01:43:30.802] d: 0 -[01:43:30.802] state: [3788509705, 0, 0, 3223325776, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1074561051, 0, 0] -``` - -Note that in this counterexample, the values of `c` and `d` are the same. In -the Rust version of the function, each element in `state` is updated -sequentially, so if two of the array indices are the same, then the value that -was updated with the first index will later be overwritten by the value at the -later index. In the Cryptol version of the function, however, all of the -positions in the array are updated simultaneously. This implicitly assumes that -all of the array indices are disjoint from each other, an assumption that we -are not encoding into `quarter_round_spec`'s preconditions. - -At this point, it can be helpful to observe _how_ the `quarter_round` function -is used in practice. The call sites are found in the `rounds` function: - -``` {.rs} -$include 92-102 code/salsa20/src/core.rs -``` - -Here, we can see that the values of `a`/`b`/`c`/`d` will only ever be chosen -from a set of eight possible options. We can take advantage of this fact to -constrain the possible set of values for `a`/`b`/`c`/`d`. The latest iteration -of the `quarter_round_spec` is now: - -``` -$include 10-38 code/salsa20/salsa20-reference.saw -``` - -Note that: - -* The `indices` value is constrained (via a precondition) to be one of the set - of values that is chosen in the `rounds` function. (Note that `\/` is the - logical-or function in Cryptol.) Each of these are concrete values that are - less than `STATE_WORDS` (`16`), so we no longer need a precondition stating - `a < `STATE_WORDS /\ ...`. - -* Because we now reference `indices` in the preconditions, we have moved its - definition up. (Previously, it was defined in the postconditions section.) - -With this in place, will SAW verify `quarter_round_spec` now? - -``` -[02:14:02.037] Verifying salsa20/10e438b3::core#1[0]::quarter_round[0] ... -[02:14:02.051] Simulating salsa20/10e438b3::core#1[0]::quarter_round[0] ... -[02:14:02.057] Checking proof obligations salsa20/10e438b3::core#1[0]::quarter_round[0] ... -[02:14:18.616] Proof succeeded! salsa20/10e438b3::core#1[0]::quarter_round[0] -``` - -At long last, it succeeds. Hooray! SAW does have to think for a while, however, -as this proof takes about 17 seconds to complete. It would be unfortunate to -have to wait 17 seconds on every subsequent invocation of SAW, and since we -still have other functions to verify, this is a very real possibility. For this -reason, it can be helpful to temporarily turn this use of `mir_verify` into a -`mir_unsafe_assume_spec`: - -``` -$include 40-44 code/salsa20/salsa20-reference.saw -``` - -Once we are done with the entire proof, we can come back and remove this use of -`mir_unsafe_assume_spec`, as we're only using it as a time-saving measure. - -### Verifying the `rounds` function - -Now that we've warmed up, let's try verifying the `rounds` function, which is -where `quarter_round` is invoked. Here is the full definition of `rounds`: - -``` {.rs} -$include 90-108 code/salsa20/src/core.rs -``` - -First, `rounds` performs `COUNT` rounds on the `state` argument. After this, it -takes each element of `self.state` and adds it to the corresponding element in -`state`. - -Linking back at the Salsa20 spec, we can see that the `rounds` function is -_almost_ an implementation of the Salsa20(x) hash function. The only notable -difference is that while the Salsa20(x) hash function converts the results to -little-endian form, the `rounds` function does not. `Salsa20.cry` implements -this part of the spec here: - -``` -$include 139-147 code/salsa20/Salsa20.cry -``` - -Where `Salsa20` is the hash function, and `Salsa20_rounds` is the part of the -hash function that excludes the little-endian conversions. In other words, -`Salsa20_rounds` precisely captures the behavior of the Rust `rounds` function. - -One aspect of the `rounds` function that will make verifying it slightly -different from verifying `quarter_rounds` is that `rounds` is defined in an -`impl` block for the `Core` struct. This means that the `&mut self` argument in -`rounds` has the type `&mut Core`. As such, we will have to look up the -`Core` ADT in SAW using `mir_find_adt`. - -This raises another question, however: when looking up `Core`, what type -should we use to instantiate `R`? As noted above, our choices are `R8`, `R12`, -and `R20`, depending on how many rounds you want. For now, we'll simply -hard-code it so that `R` is instantiated to be `R8`, but we will generalize -this a bit later. - -Alright, enough chatter—time to start writing a proof. First, let's look up the -`R8` ADT. This is defined in the `salsa20` crate in the `rounds.rs` file, so -its identifier becomes `salsa20::rounds::R8`: - -``` -$include 46-46 code/salsa20/salsa20-rounds-take-1.saw -``` - -Next, we need to look up the `PhantomData` ADT, which is used in the -`rounds` field of the `Core` struct. This is defined in `core::marker`: - -``` -$include 47-47 code/salsa20/salsa20-rounds-take-1.saw -``` - -Finally, we must look up `Core` itself. Like `quarter_round`, the `Core` -struct` is defined in `salsa20::core#1`: - -``` -$include 48-48 code/salsa20/salsa20-rounds-take-1.saw -``` - -Now that we have the necessary prerequisites, let's write a spec for the -`rounds` function. First, we need to allocate a reference for the `self` -argument: - -``` -$include 50-51 code/salsa20/salsa20-rounds-take-1.saw -``` - -Next, we need to create symbolic values for the fields of the `Core` struct, -which `self_ref` will point to. The `self.state` field will be a fresh array, -and the `self.rounds` field will be a simple, empty struct value: - -``` -$include 52-53 code/salsa20/salsa20-rounds-take-1.saw -``` - -Finally, putting all of the `self` values together: - -``` -$include 54-55 code/salsa20/salsa20-rounds-take-1.saw -``` - -Next, we need a `state` argument (not to be confused with the `self.state` -field in `Core`). This is handled much the same as it was in -`quarter_round_spec`: - -``` -$include 57-59 code/salsa20/salsa20-rounds-take-1.saw -``` - -Lastly, we cap it off with a call to `mir_execute_func`: - -``` -$include 61-62 code/salsa20/salsa20-rounds-take-1.saw -``` - -(Again, we're missing some postconditions describing what `self_ref` and -`state_ref` point to after the function returns, but we'll return to that in a -bit.) - -If we run SAW at this point, we see that everything in `rounds_spec` -typechecks, so we're off to a good start. Let's keep going and add a -`mir_verify` call. - -Here, we are faced with an interesting question: what is the identifier for -`rounds::`? The `rounds` function is defined using generics, so we can't -verify it directly—we must instead verify a particular instantiation of -`rounds`. At present, there isn't a good way to look up what the identifiers -for instantiations of generic functions are (there [will be in the -future](https://github.com/GaloisInc/saw-script/issues/1980)), but it turns out -that the identifier for `rounds::` is this: - -``` -$include 64-65 code/salsa20/salsa20-rounds-take-1.saw -``` - -Note that we are using `quarter_round_ov` as a compositional override. Once -again, SAW is happy with our work thus far: - -``` -[03:12:35.990] Proof succeeded! salsa20/10e438b3::core#1[0]::{impl#0}[0]::rounds[0]::_inst6e4a2d7250998ef7[0] -``` - -Nice. Now let's go back and fill in the missing postconditions in -`rounds_spec`. In particular, we must declare what happens to both `self_ref` -and `state_ref`. A closer examination of the code in the Rust `rounds` function -reveals that the `self` argument is never modified at all, so that part is -easy: - -``` -$include 63-63 code/salsa20/salsa20-rounds-take-2.saw -``` - -The `state` argument, on the other hand, is modified in-place. This time, our -job is made easier by the fact that `Salsa20_rounds` implements _exactly_ what -we need. Because we are instantiating `rounds` at type `R8`, we must explicitly -state that we are using 8 rounds: - -``` -$include 64-64 code/salsa20/salsa20-rounds-take-2.saw -``` - -Once again, SAW is happy with our work. We're on a roll! - -Now let's address the fact that we are hard-coding everything to `R8`, which is -somewhat uncomfortable. We can make things better by allowing the user to -specify the number of rounds. The first thing that we will need to change is -the `r_adt` definition, which is response for looking up `R8`. We want to turn -this into a function that, depending on the user input, will look up `R8`, -`R12`, or `R20`: - -``` -$include 46-46 code/salsa20/salsa20-rounds-take-3.saw -``` - -Where `str_concat` is a SAW function for concatenating strings together: - -``` -sawscript> :type str_concat -String -> String -> String -``` - -We also want to parameterize `phantom_data_adt` and `core_adt`: - -``` -$include 47-48 code/salsa20/salsa20-rounds-take-3.saw -``` - -Next, we need to parameterize `rounds_spec` by the number of rounds. This will -require changes in both the preconditions and postconditions. On the -preconditions side, we must pass the number of rounds to the relevant -functions: - -``` -$include 50-56 code/salsa20/salsa20-rounds-take-3.saw -``` - -And on the postconditions side, we must pass the number of rounds to the -Cryptol `Salsa20_rounds` function: - -``` -$include 66-67 code/salsa20/salsa20-rounds-take-3.saw -``` - -Finally, we must adjust the call to `rounds_spec` in the context of -`mir_verify` so that we pick `8` as the number of rounds: - -``` -$include 69-70 code/salsa20/salsa20-rounds-take-3.saw -``` - -SAW is happy with this generalization. To demonstrate that we have generalized -things correctly, we can also verify the same function at `R20` instead of -`R8`: - -``` -$include 71-72 code/salsa20/salsa20-rounds-take-3.saw -``` - -The only things that we had to change were the identifier and the argument to -`rounds_spec`. Not bad! - -### Verifying the `counter_setup` function - -We're very nearly at the point of being able to verify `apply_keystream`. -Before we do, however, there is one more function that `apply_keystream` calls, -which we ought to verify first: `counter_setup`. Thankfully, the implementation -of `counter_setup` is short and sweet: - -``` {.rs} -$include 83-86 code/salsa20/src/core.rs -``` - -This updates the elements of the `state` array at indices `8` and `9` with the -lower 32 bits and the upper 32 bits of the `counter` argument, respecitvely. -At a first glance, there doesn't appear to be any function in `Salsa20.cry` -that directly corresponds to what `counter_setup` does. This is a bit of a -head-scratcher, but the answer to this mystery will become more apparent as we -get further along in the proof. - -For now, we should take matters into our own hands and write our own Cryptol -spec for `counter_setup`. To do this, we will create a new Cryptol file named -`Salsa20Extras.cry`, which imports from `Salsa20.cry`: - -``` -$include 1-3 code/salsa20/Salsa20Extras.cry -``` - -The Cryptol implementation of `counter_setup` will need arrays of length -`STATE_WORDS`, so we shall define `STATE_WORDS` first: - -``` -$include 5-5 code/salsa20/Salsa20Extras.cry -``` - -Note that we preceded this definition with the `type` keyword. In Cryptol, -sequence lengths are encoded at the type level, so if we want to use -`STATE_WORDS` at the type level, we must declare it as a `type`. - -Finally, we can write a Cryptol version of `counter_setup` using our old friend -`updates` to perform a bulk sequence update: - -``` -$include 8-10 code/salsa20/Salsa20Extras.cry -``` - -Note that `counter` is a 64-bit word, but the elements of the `state` sequence -are 32-bit words. As a result, we have to explicitly truncate `counter` and -`counter >> 32` to 32-bit words by using the `drop` function, which drops the -first 32 bits from each word. - -Returning to `salsa20.saw`, we must now make use of our new Cryptol file by -`import`ing it at the top: - -``` -$include 5-5 code/salsa20/salsa20-reference.saw -``` - -With the `counter_setup` Cryptol implementation in scope, we can now write -a spec for the Rust `counter_setup` function. There's not too much to remark -on here, as the spec proves relatively straightforward to write: - -``` -$include 74-90 code/salsa20/salsa20-reference.saw -``` - -We can now verify `counter_setup` against `counter_setup_spec` at lengths `8` -and `20`: - -``` -$include 92-95 code/salsa20/salsa20-reference.saw -``` - -That wasn't so bad. It's a bit unsatisfying that we had to resort to writing a -Cryptol function not found in `Salsa20.cry`, but go along with this for now—it -will become apparent later why this needed to be done. - -### Verifying the `apply_keystream` function (first attempt) - -It's time. Now that we've verified `rounds` and `counter_setup`, it's time to -tackle the topmost function in the call stack: `apply_keystream`: - -``` {.rs} -$include 68-80 code/salsa20/src/core.rs -``` - -There aren't _that_ many lines of code in this function, but there is still -quite a bit going on. Let's walk through `apply_keystream` in more detail: - -1. The `output` argument represents the message to encrypt (or decrypt). - `output` is a slice of bytes, so in principle, `output` can have an arbitrary - length. That being said, the first line of `apply_keystream`'s implementation - checks that `output`'s length is equal to `BLOCK_SIZE`: - - ``` {.rs} - $include 69-69 code/salsa20/src/core.rs - ``` - - Where `BLOCK_SIZE` is defined here: - - ``` {.rs} - $include 82-83 code/salsa20/src/lib.rs - ``` - - So in practice, `output` must have exactly 64 elements. - -2. Next, `apply_keystream` invokes the `counter_setup` and `rounds` functions - to set up the keystream (the local `state` variable). - -3. Finally, `apply_keystream` combines the keystream with `output`. It does - so by chunking `output` into a sequence of 4 bytes, and then it XOR's the - value of each byte in-place with the corresponding byte from the keystream. - This performs little-endian conversions as necessary. - -The fact that we are XOR'ing bytes strongly suggests that this is an -implementation of the Salsa20 encryption function from the spec. There is an -important difference between how the Salsa20 spec defines the encryption -function versus how `apply_keystream` defines it, however. In the Salsa20 spec, -encryption is a function of a key, nonce, and a message. `apply_keystream`, on -the other hand, is a function of `self`'s internal state, a counter, and a -message. The two aren't _quite_ the same, which is makes it somewhat tricky to -describe one in terms of the other. - -`Salsa20.cry` defines a straightforward Cryptol port of the Salsa20 encryption -function from the spec, named `Salsa20_encrypt`. Because it takes a key and a -nonce as an argument, it's not immediately clear how we'd tie this back to -`apply_keystream`. But no matter: we can do what we did before and define our -own Cryptol version of `apply_keystream` in `Salsa20Extras.cry`: - -``` -$include 12-17 code/salsa20/Salsa20Extras.cry -``` - -This implementation builds on top of the Cryptol `counter_setup` and -`Salsa20_rounds` functions, which we used as the reference implementations for -the Rust `counter_setup` and `rounds` functions, respectively. We also make -sure to define a `BLOCK_SIZE` type alias at the top of the file: - -``` -$include 6-6 code/salsa20/Salsa20Extras.cry -``` - -Now let's write a SAW spec for `apply_keystream`. Once again, we will need to -reference `BLOCK_SIZE` when talking about the `output`-related parts of the -spec, so make sure to define `BLOCK_SIZE` at the top of the `.saw` file: - -``` -$include 8-8 code/salsa20/salsa20-reference.saw -``` - -First, we need to declare all of our arguments, which proceeds as you would -expect: - -``` -$include 120-136 code/salsa20/salsa20-reference.saw -``` - -What about the postconditions? We have two mutable references to contend with: -`self_ref` and `output_ref`. The postcondition for `self_ref` is fairly -straightforward: the only time it is ever modified is when `counter_setup` is -called. This means that after the `apply_keystream` function has returned, -`self_ref` will point to the results of calling the `counter_setup` Cryptol -function: - -``` -$include 138-140 code/salsa20/salsa20-reference.saw -``` - -`output_ref` is where the interesting work happenings. After the Rust -`apply_keystream` function has returned, it will point to the results of -calling the Cryptol `apply_keystream` function that we just defined: - -``` -$include 141-142 code/salsa20/salsa20-reference.saw -``` - -Finally, we can put this all together and verify `apply_keystream` against -`apply_keystream_spec` at lengths `8` and `20`: - -``` -$include 144-148 code/salsa20/salsa20-reference.saw -``` - -SAW will successfully verify these. We've achieved victory... or have we? -Recall that we had to tailor the Cryptol `apply_keystream` function to -specifically match the behavior of the corresponding Rust code. This makes the -proof somewhat underwhelming, since the low-level implementation is nearly -identical to the high-level spec. - -A more impressive proof would require linking `apply_keystream` to a Cryptol -function in the `Salsa20.cry` file, which was developed independently of the -Rust code. As we mentioned before, however, doing so will force us to reconcile -the differences in the sorts of arguments that each function takes, as -`apply_keystream` doesn't take a key or nonce argument. Time to think for a -bit. - -### Verifying the `new_raw` function - -At this point, we should ask ourselves: _why_ doesn't `apply_keystream` take a -key or nonce argument? The reason lies in the fact that the `salsa20` crate -implements Salsa20 in a stateful way. Specifically, the `Core` struct stores -internal state that is used to compute the keystream to apply when hashing. In -order to use this internal state, however, we must first initialize it. The -`new` function that is responsible for this initialization: - -``` {.rs} -$include 17-20 code/salsa20/src/core.rs -``` - -Sure enough, this function takes a key and a nonce as an argument! This is a -critical point that we overlooked. When using the `salsa20` crate, you wouldn't -use the `apply_keystream` function in isolation. Instead, you would create an -initial `Core` value using `new`, and _then_ you would invoke -`apply_keystream`. The Salsa20 spec effectively combines both of these -operations in is encryption function, whereas the `salsa20` splits these two -operations into separate functions altogether. - -Strictly speaking, we don't need to verify `new` in order to verify -`apply_keystream`, as the latter never invokes the former. Still, it will be a -useful exercise to verify `new`, as the insights we gain when doing so will -help us write a better version of `apply_keystream_spec`. - -All that being said, we probably to verify `new_raw` (a lower-level helper -function) rather than `new` itself. This is because the definitions of `Key` -and `Nonce` are somewhat involved. For instance, `Key` is defined as: - -``` {.rs} -$include 27-27 code/salsa20/src/salsa.rs -``` - -[`GenericArray`](https://docs.rs/generic-array/latest/generic_array/struct.GenericArray.html) -is a somewhat complicated abstraction. Luckily, we don't really _need_ to deal -with it, since `new_raw` deals with simple array references rather than -`GenericArray`s: - -``` {.rs} -$include 22-23 code/salsa20/src/core.rs -``` - -The full implementation of `new_raw` is rather long, so we won't inline the -whole thing here. At a high level, it initializes the `state` array of a `Core` -value by populating each element of the array with various things. Some -elements of the array are populated with `key`, some parts are populated with -`iv` (i.e., the nonce), and other parts are populated with an array named -`CONSTANTS`: - -``` {.rs} -$include 91-92 code/salsa20/src/lib.rs -``` - -The comment about `"expand 32-byte k"` is a strong hint that `new_raw` is -implementing a portion of the Salsa20 expansion function from the spec. (No -really, the spec literally says to use the exact string `"expand 32-byte -k"`—look it up!) The `Salsa20.cry` Cryptol file has an implementation of this -portion of the expansion function, which is named `Salsa20_init`: - -``` -$include 182-189 code/salsa20/Salsa20.cry -``` - -Note that we were careful to say a _portion_ of the Salsa20 expansion function. -There is also a Cryptol implementation of the full expansion function, named -`Salsa20_expansion`: - -``` -$include 179-180 code/salsa20/Salsa20.cry -``` - -This calls `Salsa20_init` followed by `Salsa20`, the latter of which performs -hashing. Importantly, `new_raw` does _not_ do any hashing on its own, just -initialization. For this reason, we want to use `Salsa20_init` as the reference -implementation of `new_raw`, not `Salsa20_expansion`. - -Alright, time to write a SAW spec. The first part of the spec is straightforward: - -``` -$include 97-106 code/salsa20/salsa20-new_raw-fail1.saw -``` - -As is usually the case, the postconditions are the tricky part. We know that -the behavior of `new_raw` will roughly coincide with the `Salsa20_init` -function, so let's try that first: - -``` -$include 108-112 code/salsa20/salsa20-new_raw-fail1.saw -``` - -If we attempt to verify this using `mir_verify`: - -``` -$include 115-118 code/salsa20/salsa20-new_raw-fail1.saw -``` - -SAW complains thusly: - -``` -Cryptol error: -[error] at salsa20.saw:109:45--109:54: - Type mismatch: - Expected type: 16 - Inferred type: 8 - Context: [ERROR] _ - When checking type of 2nd tuple field -``` - -Here, the 2nd tuple field is the `nonce_arr` in `Salsa20_init(key_arr, -nonce_arr)`. And sure enough, `Salsa20_init` expects the 2nd tuple field to be -a sequence of 16 elements, but `nonce_arr` only has 8 elements. Where do we get -the remaining 8 elements from? - -The answer to this question can be found by looking at the implementation of -`new_raw` more closely. Let's start at this code: - -``` {.rs} -$include 35-36 code/salsa20/src/core.rs -``` - -This will chunk up `iv` (the nonce) into two 4-byte chunks and copies them over -to the elements of `state` array at indices `6` and `7`. This is immediately -followed by two updates at indices `8` and `9`, which are updated to be `0`: - -``` {.rs} -$include 39-40 code/salsa20/src/core.rs -``` - -If you take the two 4-bytes chunks of `iv` and put two 4-byte `0` values after -them, then you would have a total of 16 bytes. This suggests that the nonce -value that `Salsa20_init` expects is actually this: - -``` -nonce_arr # zero : [16][8] -``` - -Where `zero : [8][8]` is a Cryptol expression that returns all zeroes, and -`(#)` is the Cryptol operator for concatenating two sequences together. Let's -update `new_raw_spec` to reflect this: - -``` -$include 109-109 code/salsa20/salsa20-new_raw-fail2.saw -``` - -This is closer to what we want, but not quite. SAW still complains: - -``` -could not match specified value with actual value: - ... - type of actual value: [u32; 16] - type of specified value: [u8; 64] -``` - -This is because `Salsa20_init` returns something of type `[64][8]`, which -corresponds to the Rust type `[u8; 64]`. `self.state`, on the other hand, is of -type `[u32; 16]`. These types are very close, as they both contain the same -number of bytes, but they are chunked up differently. Recall the code that -copies the nonce value over to `self.state`: - -``` {.rs} -$include 35-36 code/salsa20/src/core.rs -``` - -In order to resolve the type differences between `iv` and `state`, this code -needed to explicitly convert `iv` to little-endian form using the -[`u32::from_le_bytes`](https://doc.rust-lang.org/std/primitive.u32.html#method.from_le_bytes) -function. There is a similar Cryptol function in `Salsa20.cry` named -`littleendian_state`: - -``` -$include 131-132 code/salsa20/Salsa20.cry -``` - -Note that `[64][8]` is the Cryptol equivalent of `[u8; 64]`, and `[16][32]` is -the Cryptol equivalent of `[u32; 16]`. As such, this is exactly the function -that we need to resolve the differences in types: - -``` -$include 109-109 code/salsa20/salsa20-reference.saw -``` - -With that change, SAW is finally happy with `new_raw_spec` and successfully -verifies it. - -There is an interesting connection between the `new_raw` and `counter_setup` -functions. Both functions perform in-place updates on `state` at indices `8` -and `9`. Whereas `new_raw` always sets these elements of `state` to `0`, -`counter_setup` will set them to the bits of the `counter` argument (after -converting `counter` to little-endian form). This means that if you invoke -`counter_setup` right after `new_raw`, then `counter_setup` would overwrite the -`0` values with the `counter` argument. In order words, it would be tantamount -to initializing `state` like so: - -``` -littleendian_state (Salsa20_init(key, nonce # littleendian_inverse counter)) -``` - -Where `littleendian_inverse` (a sibling of `littleendian_state`) converts a -`[64]` value to a `[8][8]` one. This pattern is a curious one... - -### Verifying the `apply_keystream` function (second attempt) - -Let's now return to the problem of linking `apply_keystream` up to -`Salsa20_encrypt`. In particular, let's take a closer look at the definition of -`Salsa20_encrypt` itself: - -``` -$include 198-201 code/salsa20/Salsa20.cry -``` - -Does anything about this definition strike you as interesting? Take a look at -the `v#(littleendian_inverse i)` part—we _just_ saw a use of -`littleendian_inverse` earlier in our discussion about initializing the -`state`! Moreover, `v` is the nonce argument, so it is becoming clearer that -`Sals20_encrypt` is creating an initial state is much the same way that -`new_raw` is. - -A related question: what is the `i` value? The answer is somewhat technical: -the Salsa20 encryption function is designed to work with messages with -differing numbers of bytes (up to `2^^70` bytes, to be exact). Each 8-byte -chunk in the message will be encrypted with a slightly difference nonce. For -instance, the first 8-byte chunk's nonce will have its lower 32 bits set to -`0`, the second 8-byte chunk's nonce will have its lower 32 bits set to `1`, -and so on. In general, the `i`th 8-byte chunk's nonce will have its lower 32 -bits set to `i`, and this corresponds exactly to the `i` in the expression -`littleendian_inverse i`. - -Note, however, that `apply_keystream` only ever uses a message that consists of -exactly eight 8-byte chunks. This means that `Salsa20_encrypt` will only ever -invoke `Salsa20_expansion` once with a nonce value where the lower 32 bits are -set to `0`. That is, it will perform encryption with an initial state derived -from: - -``` -Salsa20_init(k, v#(littleendian_inverse zero)) -``` - -Which can be further simplified to `Salsa20_init(k, v # zero)`. This is very -nearly what we want, as this gives us the behavior of the Rust `new_raw` -function. There's just one problem though: it doesn't take the behavior of -`counter_setup` into account. How do we go from `zero` to `littleendian_inverse -counter`? - -While `Salsa20_encrypt` doesn't take counters into account at all, it is not -too difficult to generalize `Salsa20_encrypt` in this way. There is a variant -of `Salsa20_encrypt` in the same file named `Salsa20_encrypt_with_offset`, -where the offset argument `o` serves the same role that `counter` does in -`counter_setup`: - -``` -$include 191-196 code/salsa20/Salsa20.cry -``` - -(Observe that `Salsa20_encrypt(count, k, v, m)` is equivalent to -`Salsa20_encrypt_with_offset(count, k, v, 0, m)`.) - -At long last, we have discovered the connection between `apply_keystream` and -the Salsa20 spec. If you assume that you invoke `new_raw` beforehand, then the -behavior of `apply_keystream` corresponds exactly to that of -`Salsa20_encrypt_with_offset`. This insight will inform us how to write an -alternative SAW spec for `apply_keystream`: - -``` -$include 149-160 code/salsa20/salsa20-apply_keystream_alt-fail1.saw -``` - -Observe the following differences between `apply_keystream_alt_spec` and our -earlier `apply_keystream_spec`: - -1. In `apply_keystream_alt_spec`, we declare fresh `key` and `nonce` values, - which weren't present at all in `apply_keystream_spec`. - -2. In `apply_keystream_alt_spec`, we no longer make `self_state` a fresh, - unconstrained value. Instead, we declare that it must be the result of - calling `Salsa20_init` on the `key`, `nonce`, and `counter` values. This - is the part that encodes the assumption that `new_raw` was invoked - beforehand. - -The parts of the spec relating to `output` remain unchanged: - -``` -$include 162-167 code/salsa20/salsa20-apply_keystream_alt-fail1.saw -``` - -The postconditions are slightly different in `apply_keystream_alt_spec`. While -the parts relating to `self_ref` remain unchanged, we now have `output_ref` -point to the results of calling `Salsa20_encrypt_with_offset`: - -``` -$include 169-172 code/salsa20/salsa20-apply_keystream_alt-fail1.saw -``` - -Tying this all together, we call `mir_verify`, making sure to use compositional -overrides involving `counter_setup` and `rounds`: - -``` -$include 175-178 code/salsa20/salsa20-apply_keystream_alt-fail1.saw -``` - -At long last, it is time to run SAW on this. When we do, we see this: - -``` -[15:11:44.576] Checking proof obligations salsa20/10e438b3::core#1[0]::{impl#0}[0]::apply_keystream[0]::_inst6e4a2d7250998ef7[0] ... -``` - -After this, SAW loops forever. Oh no! While somewhat disheartening, this is a -reality of SMT-based verification that we must content with. SMT solvers are -extremely powerful, but their performance can sometimes be unpredictable. The -task of verifying `apply_keystream_alt_spec` is _just_ complicated enough that -Z3 cannot immediately figure out that the proof is valid, so it resorts to much -slower algorithms to solve proof goals. - -We could try waiting for Z3 to complete, but we'd be waiting for a long time. -It's not unheard of for SMT solvers to take many hours on especially hard -problems, but we don't have that many hours to spare. We should try a slightly -different approach instead. - -When confronted with an infinite loop in SAW, there isn't a one-size-fits-all -solution that will cure the problem. Sometimes, it is worth stating your SAW -spec in a slightly different way such that the SMT solver can spot patterns -that it couldn't before. Other times, it can be useful to try and break the -problem up into smaller functions and use compositional verification to handle -the more complicated subfunctions. As we mentioned before, the performance of -SMT solvers in unpredictable, and it's not always obvious what the best -solution is. - -In this example, however, the problem lies with Z3 itself. As it turns out, -Yices (a different SMT solver) _can_ spot the patterns needed to prove -`apply_keystream_alt_spec` immediately. Fortunately, SAW includes support for -both Z3 and Yices. In order to switch from Z3 to Yices, swap out the `z3` proof -script with `yices`: - -``` -$include 175-178 code/salsa20/salsa20-reference.saw -``` - -After doing this, SAW is leverage Yices to solve the proof goals almost -immediately: - -``` -[15:22:00.745] Proof succeeded! salsa20/10e438b3::core#1[0]::{impl#0}[0]::apply_keystream[0]::_instfa33e77d840484a0[0] -``` - -And with that, we're finally done! You've successfully completed a non-trivial -SAW exercise in writing some interesting proofs. Give yourself a well-deserved -pat on the back. - -The process of developing these proofs was bumpy at times, but that is to be -expected. You very rarely get a proof correct on the very first try, and when -SAW doesn't accept your proof, it is important to be able to figure out what -went wrong and how to fix it. This is a skill that takes some time to grow, but -with enough time and experience, you will be able to recognize common pitfalls. -This case study showed off some of these pitfalls, but there are likely others. - -## A final word - -Like everything else in the `saw-script` repo, this tutorial is being -maintained and developed. If you see something in the tutorial that is wrong, -misleading, or confusing, please file an issue about it -[here](https://github.com/GaloisInc/saw-script/issues). diff --git a/doc/rust-verification-with-saw/a-final-word.md b/doc/rust-verification-with-saw/a-final-word.md new file mode 100644 index 0000000000..a88c8f36a5 --- /dev/null +++ b/doc/rust-verification-with-saw/a-final-word.md @@ -0,0 +1,6 @@ +# A final word + +Like everything else in the `saw-script` repo, this tutorial is being +maintained and developed. If you see something in the tutorial that is wrong, +misleading, or confusing, please file an issue about it +[here](https://github.com/GaloisInc/saw-script/issues). diff --git a/doc/rust-verification-with-saw/about-mir-json.md b/doc/rust-verification-with-saw/about-mir-json.md new file mode 100644 index 0000000000..ce0576c6ed --- /dev/null +++ b/doc/rust-verification-with-saw/about-mir-json.md @@ -0,0 +1,290 @@ +# About `mir-json` + +We are interested in verifying code written in Rust, but Rust is an extremely +rich programming language that has many distinct language features. To make the +process of verifying Rust simpler, SAW targets an intermediate language used in +the Rust compiler called [MIR](https://blog.rust-lang.org/2016/04/19/MIR.html) +(short for "Mid-level IR"). MIR takes the variety of different features and +syntactic extensions in Rust and boils them down to a minimal subset that is +easier for a computer program to analyze. + +The process of extracting MIR code that is suitable for SAW's needs is somewhat +tricky, so we wrote a suite of tools called `mir-json` to automate this +process. `mir-json` provides a plugin for tools like `rustc` and `cargo` that +lets you compile Rust code as you normally would and produce an additional +`.json` file as output. This `.json` file contains the MIR code that was +produced internally by the Rust compiler, with some additional minor tweaks to +make it suitable for SAW's needs. + +`mir-json` is not a single tool but rather a suite of related tools that +leverage the same underlying plugin. For SAW purposes, the two `mir-json` tools +that are most relevant are: + +* `saw-rustc`: A thin wrapper around `rustc` (the Rust compiler), which is + suitable for individual `.rs` files. +* `cargo-saw-build`: A thin wrapper around the `cargo build` command, which is + suitable for `cargo`-based Rust projects. + +Most of the examples in this tutorial involve self-contained examples, which +will use `saw-rustc`. Later in the tutorial, we will examine a Salsa20 case +study that involves a `cargo`-based project, which will use `cargo-saw-build`. + +## A first example with `saw-rustc` + +Let's try out `saw-rustc` on a small example file, which we'll name +`first-example.rs`: + +:::{literalinclude} code/first-example.rs +:language: rust +::: + +This is the identity function, but specialized to the type `u8`. We can compile +this with `saw-rustc` like so: + +:::{code-block} console +$ saw-rustc first-example.rs + +note: Emitting MIR for first_example/abef32c5::id_u8 + +linking 1 mir files into first-example.linked-mir.json + +::: + +`saw-rustc` prints out some additional information that `rustc` alone does not +print, and we have displayed the parts of this information that are most +interesting. In particular: + +* `saw-rustc` notes that is is `Emitting MIR for first_example/abef32c5::id_u8`, + where `first_example/abef32c5::id_u8` is the full _identifier_ used to uniquely + refer to the `id_u8` function. It's entirely possible that the `abef32c5` + bit will look different on your machine; we'll talk more about identifiers in + the "Identifiers" section. +* Once `saw-rustc` produced a MIR JSON file named + `first-example.linked-mir.json`. This is an important bit of information, as + SAW will ingest this JSON file. + +If you'd like, you can inspect the `first-example.linked-mir.json` file with +JSON tools (e.g., [`jq`](https://jqlang.github.io/jq/)), but it is not +important to understand everything that is going on there. This is +machine-generated JSON, and as such, it is not meant to be particularly +readable by human eyes. + +(saw-rust-lib-path-env-var)= +## The `SAW_RUST_LIBRARY_PATH` environment variable + +Rust has a large set of standard libraries that ship with the compiler, and +parts of the standard library are quite low-level and tricky. SAW's primary +objective is to provide a tool that can analyze code in a tractable way. For +this reason, SAW sometimes needs to invoke simplified versions of Rust standard +library functions that are more reasonable for an SMT-based tool like SAW to +handle. These simplified functions are equivalent in behavior, but avoid using +problematic code patterns (e.g., gnarly pointer arithmetic or the +[`transmute`](https://doc.rust-lang.org/std/intrinsics/fn.transmute.html) +function). + +If you are only ever compiling self-contained pieces of code with `saw-rustc`, +there is a good chance that you can get away without needing to use SAW's +custom version of the Rust standard libraries. However, if you ever need to +build something more complicated than that (e.g., the Salsa20 case study later +in this tutorial), then you _will_ need the custom libraries. For this reason, +it is worthwhile to teach SAW the location of the custom libraries now. + +At present, the best way to obtain the custom version of the Rust standard +libraries is to perform the following steps: + +1. Clone the [`crucible`](https://github.com/GaloisInc/crucible) repo like so: + + :::{code-block} console + $ git clone https://github.com/GaloisInc/crucible + ::: + +2. Navigate to the + [`crux-mir`](https://github.com/GaloisInc/crucible/tree/master/crux-mir) + subdirectory of the `crucible` checkout: + + :::{code-block} console + $ cd crucible/crux-mir/ + ::: + +3. Run the `translate_libs.sh` script: + + :::{code-block} console + $ ./translate_libs.sh + ::: + + This will compile the custom versions of the Rust standard libraries using + `mir-json`, placing the results under the `rlibs` subdirectory. + +4. Finally, define a `SAW_RUST_LIBRARY_PATH` environment variable that points + to the newly created `rlibs` subdirectory: + + :::{code-block} console + $ export SAW_RUST_LIBRARY_PATH=<...>/crucible/crux-mir/rlibs + ::: + +An upcoming release of SAW will include these custom libraries pre-built, which +will greatly simplify the steps above. Either way, you will need to set the +`SAW_RUST_LIBRARY_PATH` environment variable to point to the location of the +custom libraries. + +## A note about generics + +The `id_u8` function above is likely not how most Rust programmers would define +the identity function. Instead, it would seem more natural to define it +generically, that is, by parameterizing the function by a type parameter: + +:::{literalinclude} code/generics-take-1.rs +:language: rust +::: + +If you compile this with `saw-rustc`, however, the resulting JSON file will +lack a definition for `id`! We can see this by using `jq`: + +:::{code-block} console +$ saw-rustc generics-take-1.rs + +$ jq . generics-take-1.linked-mir.json +{ + "fns": [], + "adts": [], + "statics": [], + "vtables": [], + "traits": [], + "intrinsics": [], + "tys": [], + "roots": [] +} +::: + +What is going on here? This is the result of an important design choice that +SAW makes: _SAW only supports verifying monomorphic functions_. To be more +precise, SAW's approach to symbolic simulation requires all of the code being +simulated to have fixed types without any type parameters. + +In order to verify a function using generics in your Rust code, you must +provide a separate, monomorphic function that calls into the generic function. +For example, you can rewrite the example above like so: + +:::{literalinclude} code/generics-take-2.rs +:language: rust +::: + +If you compile this version with `saw-rustc`, you'll see: + +:::{code-block} console +$ saw-rustc generics-take-2.rs + +note: Emitting MIR for generics_take_2/8b1bf337::id_u8 + +note: Emitting MIR for generics_take_2/8b1bf337::id::_instaddce72e1232152c[0] + +linking 1 mir files into generics-take-2.linked-mir.json + +::: + +This time, the resulting JSON file contains a definition for `id_u8`. The +reason that this works is because when `id_u8` calls `id`, the Rust compile +will generate a specialized version of `id` where `A` is instantiated with the +type `u8`. This specialized version of `id` is named +`id::_instaddce72e1232152c[0]` in the output above. (You don't have to remember +this name, thankfully!) + +Although the resulting JSON file contains a definition for `id_u8`, it does +_not_ contain a definition for the generic `id` function. As a result, SAW will +only be able to verify the `id_u8` function from this file. If you are ever in +doubt about which functions are accessible for verification with SAW, you can +check this with `jq` like so: + +:::{code-block} console +$ jq '.intrinsics | map(.name)' generics-take-2.linked-mir.json +[ + "generics_take_2/8b1bf337::id_u8", + "generics_take_2/8b1bf337::id::_instaddce72e1232152c[0]" +] +::: + +Here, "intrinsics" are monomorphic functions that are visible to SAW. Note that +`saw-rustc` will optimize away all definitions that are not accessible from one +of these intrinsic functions. This explains why the original program that only +defined a generic `id` function resulted in a definition-less JSON file, as +that program did not contain monomorphic functions (and therefore no +intrinsics). + +Generally speaking, we prefer to verify functions that are defined directly in +the Rust source code, such as `id_u8`, as these functions' names are more +stable than the specialized versions of functions that the compiler generates, +such as `id::_instaddce72e1232152c[0]`. Do note that SAW is capable of +verifying both types of functions, however. (We will see an example of +verifying an autogenerated function in the Salsa20 case study later in this +tutorial.) + +## Identifiers + +When you compile a function named `id_u8`, `saw-rustc` will expand it to a much +longer name such as `first_example/abef32c5::id_u8`. This longer name is called +an _identifier_, and it provides a globally unique name for that function. In +the small examples we've seen up to this point, there hasn't been any risk of +name collisions, but you could imagine compiling this code alongside another +file (or crate) that also defines an `id_u8` function. If that happens, then it +is essential that we can tell apart all of the different `id_u8` functions, and +identifiers provide us the mechanism for doing so. + +Let's take a closer look at what goes into an identifier. In general, an identifier +will look like the following: + +:::{code-block} +/:: +::: + +`` is the name of the crate in which the function is defined. All +of the examples we've seen up to this point have been defined in standalone +files, and as a result, the crate name has been the same as the file name, but +without the `.rs` extension and with all hyphens replaced with underscores +(e.g., `first-example.rs` is given the crate name `first_example`). In +`cargo`-based projects, the crate name will likely differ from the file name. + +`` is a hash of the crate and its dependencies. In extreme +cases, it is possible for two different crates to have identical crate names, +in which case the disambiguator must be used to distinguish between the two +crates. In the common case, however, most crate names will correspond to +exactly one disambiguator. (More on this in a bit.) + +`` is the path to the function within the crate. Sometimes, this +is as simple as the function name itself. In other cases, a function path may +involve multiple _segments_, depending on the module hierarchy for the program +being verified. For instance, a `read` function located in +`core/src/ptr/mod.rs` will have the identifier: + +:::{code-block} +core/::ptr::read +::: + +Where `core` is the crate name and `ptr::read` is the function path, which +has two segments `ptr` and `read`. There are also some special forms of +segments that appear for functions defined in certain language constructs. +For instance, if a function is defined in an `impl` block, then it will have +`{impl}` as one of its segments, e.g., + +:::{code-block} +core/::ptr::const_ptr::{impl}::offset +::: + +The most cumbersome part of writing an identifier is the disambiguator, as it +is extremely sensitive to changes in the code (not to mention hard to remember +and type). Luckily, the vast majority of crate names correspond to exactly one +disambiguator, and we can exploit this fact to abbreviate identifiers that live +in such crates. For instance, we can abbreviate this identifier: + +:::{code-block} +core/::ptr::read +::: + +To simply: + +:::{code-block} +core::ptr::read +::: + +We will adopt the latter, shorter notation throughout the rest of the tutorial. +SAW also understands this shorthand, so we will also use this notation when +passing identifiers to SAW commands. diff --git a/doc/rust-verification-with-saw/case-study-salsa20.md b/doc/rust-verification-with-saw/case-study-salsa20.md new file mode 100644 index 0000000000..f80d83a032 --- /dev/null +++ b/doc/rust-verification-with-saw/case-study-salsa20.md @@ -0,0 +1,1337 @@ +# Case study: Salsa20 + +If you've made it this far into the tutorial, congrats! You've now been exposed +to all of the SAW fundamentals that you need to verify Rust code found in the +wild. Of course, talking about verifying real-world code is one thing, but +actually _doing_ the verification is another thing entirely. Making the jump +from the small examples to "industrial-strength" code can be intimidating. + +To make this jump somewhat less frightening, the last part of this tutorial +will consist of a case study in using SAW to verify a non-trivial piece of Rust +code. In particular, we will be looking at a Rust implementation of the +[Salsa20](https://en.wikipedia.org/wiki/Salsa20) stream cipher. We do not +assume any prior expertise in cryptography or stream ciphers in this tutorial, +so don't worry if you are not familiar with Salsa20. + +More than anything, this case study is meant to emphasize that verification is +an iterative process. It's not uncommon to try something with SAW and encounter +an error message. That's OK! We will explain what can go wrong when verifying +Salsa20 and how to recover from these mistakes. Later, if you encounter similar +issues when verifying your own code with SAW, the experience you have developed +when developing these proofs can inform you of possible ways to fix the issues. + +## The `salsa20` crate + +The code for this Salsa20 implementation we will be verifying can be found +under the +[`code/salsa20`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20) +subdirectory. This code is adapted from version 0.3.0 of the `salsa20` crate, +which is a part of the +[`stream-ciphers`](https://github.com/RustCrypto/stream-ciphers) project. The +code implements Salsa20 as well as variants such as HSalsa20 and XSalsa20, but +we will only be focusing on the original Salsa20 cipher in this tutorial. + +The parts of the crate that are relevant for our needs are mostly contained in +the +[`src/core.rs`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/src/core.rs) +file, as well as some auxiliary definitions in the +[`src/rounds.rs`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/src/rounds.rs) +and +[`src/lib.rs`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/src/lib.rs) +files. You can take a look at these files if you'd like, but you don't need to +understand everything in them just yet. We will introduce the relevant parts of +the code in the tutorial as they come up. + +## Salsa20 preliminaries + +Salsa20 is a stream cipher, which is a cryptographic technique for encrypting +and decrypting messages. A stream cipher encrypts a message by combining it +with a _keystream_ to produce a ciphertext (the encrypted message). Moreover, +the same keystream can then be combined with the ciphertext to decrypt it back +into the original message. + +The original author of Salsa20 has published a specification for Salsa20 +[here](https://cr.yp.to/snuffle/spec.pdf). This is a great starting point for a +formal verification project, as this gives us a high-level description of +Salsa20's behavior that will guide us in proving the functional correctness of +the `salsa20` crate. When we say that `salsa20` is functionally correct, we +really mean "proven correct with respect to the Salsa20 specification". + +The first step in our project would be to port the Salsa20 spec to Cryptol +code, as we will need to use this code when writing SAW proofs. The process of +transcribing an English-language specification to executable Cryptol code is +interesting in its own right, but it is not the primary focus of this tutorial. +As such, we will save you some time by providing a pre-baked Cryptol +implementation of the Salsa20 spec +[here](https://github.com/GaloisInc/saw-script/blob/master/doc/rust-tutorial/code/salsa20/Salsa20.cry). +(This implementation is +[adapted](https://github.com/GaloisInc/cryptol-specs/blob/1366ccf71db9dca58b16ff04ca7d960a4fe20e34/Primitive/Symmetric/Cipher/Stream/Salsa20.cry) +from the [`cryptol-specs`](https://github.com/GaloisInc/cryptol-specs) repo.) + +Writing the Cryptol version of the spec is only half the battle, however. We +still have to prove that the Rust implementation in the `salsa20` crate adheres +to the behavior prescribed by the spec, which is where SAW enters the picture. +As we will see shortly, the code in `salsa20` is not a direct port of the +pseudocode shown in the Salsa20 spec, as it is somewhat more low-level. SAW's +role is to provide us assurance that the behavior of the low-level Rust code +and the high-level Cryptol code coincide. + +## A note about cryptographic security + +As noted in the previous section, our goal is to prove that the behavior of +`salsa20` functions is functionally correct. This property should _not_ be +confused with cryptographic security. While functional correctness is an +important aspect of cryptographic security, a full cryptographic security audit +would encompass additional properties such as whether the code runs in constant +time on modern CPUs. As such, the SAW proofs we will write would not constitute +a full security audit (and indeed, the [`salsa20` +`README`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/README.md) +states that the crate has never received such an audit). + +## An overview of the `salsa20` code + +Before diving into proofs, it will be helpful to have a basic understanding of +the functions and data types used in the `salsa20` crate. Most of the +interesting code lives in +[`src/core.rs`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/src/core.rs). +At the top of this file, we have the `Core` struct: + +:::{literalinclude} code/salsa20/src/core.rs +:lines: 8-14 +:language: rust +::: + +Let's walk through this: + +* The `state` field is an array that is `STATE_WORDS` elements long, where + `STATE_WORDS` is a commonly used alias for `16`: + + :::{literalinclude} code/salsa20/src/lib.rs + :lines: 88-89 + :language: rust + ::: + +* The `rounds` field is of type `PhantomData`. If you haven't seen it + before, + [`PhantomData`](https://doc.rust-lang.org/std/marker/struct.PhantomData.html) + is a special type that tells the Rust compiler to pretend as though the + struct is storing something of type `R`, even though a `PhantomData` value + will not take up any space at runtime. + +The reason that `Core` needs a `PhantomData` field is because `R` +implements the `Rounds` trait: + +:::{literalinclude} code/salsa20/src/rounds.rs +:lines: 1-5 +:language: rust +::: + +A core operation in Salsa20 is hashing its input through a series of +_rounds_. The `COUNT` constant indicates how many rounds should be performed. +The Salsa20 spec assumes 20 rounds: + +:::{literalinclude} code/salsa20/src/rounds.rs +:lines: 23-29 +:language: rust +::: + +However, there are also reduced-round variants that perform 8 and 12 rounds, +respectively: + +:::{literalinclude} code/salsa20/src/rounds.rs +:lines: 7-21 +:language: rust +::: + +Each number of rounds has a corresponding struct whose names begins with the +letter `R`. For instance, a `Core` value represents a 20-round Salsa20 +cipher. Here is the typical use case for a `Core` value: + +* A `Core` value is created using the `new` function: + + :::{literalinclude} code/salsa20/src/core.rs + :lines: 18 + :language: rust + ::: + + We'll omit the implementation for now. This function takes a secret `Key` + value and a unique `Nonce` value and uses them to produce the initial `state` + in the `Core` value. + +* After creating a `Core` value, the `counter_setup` and `rounds` functions are + used to produce the Salsa20 keystream: + + :::{literalinclude} code/salsa20/src/core.rs + :lines: 83 + :language: rust + ::: + + :::{literalinclude} code/salsa20/src/core.rs + :lines: 90 + :language: rust + ::: + + We'll have more to say about these functions later. + +* The _pièce de résistance_ is the `apply_keystream` function. This takes a + newly created `Core` value, produces its keystream, and applies it to a + message to produce the `output`: + + :::{literalinclude} code/salsa20/src/core.rs + :lines: 68 + :language: rust + ::: + +Our ultimate goal is to verify the `apply_keystream` function, which is the +Rust equivalent of the Salsa20 encryption function described in the spec. + +## Building `salsa20` + +The next step is to build the `salsa20` crate. Unlike the examples we have seen +up to this point, which have been self-contained Rust files, `salsa20` is a +`cargo`-based project. As such, we will need to build it using `cargo +saw-build`, an extension to the `cargo` package manager that integrates with +`mir-json`. Before you proceed, make sure that you have defined the +`SAW_RUST_LIBRARY_PATH` environment variable as described in [this +section](#saw-rust-lib-path-env-var). + +To build the `salsa20` crate, perform the following steps: + +:::{code-block} console +$ cd code/salsa20/ +$ cargo saw-build +::: + +Near the end of the build output, you will see a line that looks like this: + +:::{code-block} console +linking 9 mir files into <...>/saw-script/doc/rust-tutorial/code/salsa20/target/x86_64-unknown-linux-gnu/debug/deps/salsa20-dd0d90f28492b9cb.linked-mir.json +::: + +This is the location of the MIR JSON file that we will need to provide to SAW. +(When we tried it, the hash in the file name was `dd0d90f28492b9cb`, but it +could very well be different on your machine.) Due to how `cargo` works, the +location of this file is in a rather verbose, hard-to-remember location. For +this reason, we recommend copying this file to a different path, e.g., + +:::{code-block} console +$ cp <...>/saw-script/doc/rust-tutorial/code/salsa20/target/x86_64-unknown-linux-gnu/debug/deps/salsa20-dd0d90f28492b9cb.linked-mir.json code/salsa20/salsa20.linked-mir.json +::: + +As a safeguard, we have also checked in a compressed version of this MIR JSON +file as +[`code/salsa20/salsa/salsa20.linked-mir.json.tar.gz`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/salsa20.linked-mir.json.tar.gz). +In a pinch, you can extract this archive to obtain a copy of the MIR JSON file, +which is approximately 4.6 megabytes when uncompressed. + +## Getting started with SAW + +Now that we've built the `salsa20` crate, it's time to start writing some +proofs! Let's start a new `code/salsa20/salsa20.saw` file as fill it in with +the usual preamble: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 1-3 +:language: sawscript +::: + +We are also going to need to make use of the Cryptol implementation of the +Salsa20 spec, which is defined in +[`code/salsa20/Salsa20.cry`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/Salsa20.cry). +SAW allows you to import standalone Cryptol `.cry` files by using the `import` +command: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 4 +:language: sawscript +::: + +As an aside, note that we have also checked in a +[`code/salsa20/salsa20-reference.saw`](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20/salsa20-reference.saw), +which contains a complete SAW file. We encourage you _not_ to look at this file +for now, since following along with the tutorial is meant to illustrate the +"a-ha moments" that one would have in the process of writing the proofs. In you +become stuck while following along and absolutely need a hint, however, then +this file can help you become unstuck. + +## Verifying our first `salsa20` function + +Now it's time to start verifying some `salsa20` code. But where do we start? +It's tempting to start with `apply_keystream`, which is our end goal. This is +likely going to be counter-productive, however, as `apply_keystream` is a +large function with several moving parts. Throwing SAW at it immediately is +likely to cause it to spin forever without making any discernible progress. + +For this reason, we will instead take the approach of working from the +bottom-up. That is, we will first verify the functions that `apply_keystream` +transitively invokes, and then leverage compositional verification to verify a +proof of `apply_keystream` using overrides. This approach naturally breaks up +the problem into smaller pieces that are easier to understand in isolation. + +If we look at the implementation of `apply_keystream`, we see that it invokes +the `round` function, which in turn invokes the `quarter_round` function: + +:::{literalinclude} code/salsa20/src/core.rs +:lines: 122-142 +:language: rust +::: + +`quarter_round` is built on top of the standard library functions +[`wrapping_add`](https://doc.rust-lang.org/std/primitive.usize.html#method.wrapping_add) +and +[`rotate_left`](https://doc.rust-lang.org/std/primitive.usize.html#method.rotate_left), +so we have finally reached the bottom of the call stack. This makes +`quarter_round` a good choice for the first function to verify. + +The implementation of the Rust `quarter_round` function is quite similar to the +Cryptol `quarterround` function in `Salsa20.cry`: + +:::{literalinclude} code/salsa20/Salsa20.cry +:lines: 10-16 +:language: cryptol +::: + +The Cryptol `quarterround` function doesn't have anything like the `state` +argument in the Rust `quarter_round` function, but let's not fret about that +too much yet. Our SAW spec is going to involve `quarterround` _somehow_—we just +have to figure out how to make it fit. + +Let's start filling out the SAW spec for `quarter_round`: + +:::{literalinclude} code/salsa20/salsa20-quarter_round-fail1.saw +:lines: 6 +:language: sawscript +::: + +We are going to need some fresh variables for the `a`, `b`, `c`, and `d` +arguments: + +:::{literalinclude} code/salsa20/salsa20-quarter_round-fail1.saw +:lines: 7-10 +:language: sawscript +::: + +We will also need to allocate a reference for the `state` argument. The +reference's underlying type is `STATE_WORDS` (`16`) elements long: + +:::{literalinclude} code/salsa20/salsa20-quarter_round-fail1.saw +:lines: 11-13 +:language: sawscript +::: + +Finally, we will need to pass these arguments to the function: + +:::{literalinclude} code/salsa20/salsa20-quarter_round-fail1.saw +:lines: 15-21 +:language: sawscript +::: + +With that, we have a spec for `quarter_round`! It's not very interesting just +yet, as we don't specify what `state_ref` should point to after the function +has returned. But that's fine for now. When developing a SAW proof, it can be +helpful to first write out the "skeleton" of a function spec that only contains +the call to `mir_execute_func`, without any additional preconditions or +postconditions. We can add those later after ensuring that the skeleton works +as expected. + +Let's check our progress thus far by running this through SAW: + +:::{code-block} console +$ saw salsa20.saw +... +[23:16:05.080] Type errors: + salsa20.saw:12:39-12:68: Unbound variable: "STATE_WORDS" (salsa20.saw:12:49-12:60) +Note that some built-in commands are available only after running +either `enable_deprecated` or `enable_experimental`. + + salsa20/salsa20.saw:11:31-11:60: Unbound variable: "STATE_WORDS" (salsa20.saw:11:41-11:52) +Note that some built-in commands are available only after running +either `enable_deprecated` or `enable_experimental`. +::: + +We've already run into some type errors. Not too surprising, considering this +was our first attempt. The error message contains that `STATE_WORDS` is +unbound. This makes sense if you think about it, as `STATE_WORDS` is defined in +the Rust code, but not in the SAW file itself. Let's fix that by adding this +line to `salsa20.saw`: + +:::{literalinclude} code/salsa20/salsa20-quarter_round-fail2.saw +:lines: 6 +:language: sawscript +::: + +That change fixes the type errors in `quarter_round_spec`. Hooray! Let's press +on. + +Next, we need to add a call to `mir_verify`. In order to do this, we need to +know what the full identifier for the `quarter_round` function is. Because it +is defined in the `salsa20` crate and in the `core.rs` file, so we would expect +the identifier to be named `salsa20::core::quarter_round`: + +:::{literalinclude} code/salsa20/salsa20-quarter_round-fail2.saw +:lines: 25-26 +:language: sawscript +::: + +However, SAW disagrees: + +:::{code-block} console +[00:22:56.970] Stack trace: +"mir_verify" (salsa20.saw:26:3-26:13) +Couldn't find MIR function named: salsa20::core::quarter_round +::: + +Ugh. This is a consequence of how `mir-json` disambiguates identifiers. Because +there is a separate `core` crate in the Rust standard libraries, `mir-json` +uses "`core#1`", a distinct name, to refer to the `core.rs` file. You can see +this for yourself by digging around in the MIR JSON file, if you'd like. (In a +future version of SAW, one will be able to [look this name +up](https://github.com/GaloisInc/saw-script/issues/1980) more easily.) + +Once we change the identifier: + +:::{literalinclude} code/salsa20/salsa20-quarter_round-fail3.saw +:lines: 25-26 +:language: sawscript +::: + +We can run SAW once more. This time, SAW complains about a different thing: + +:::{code-block} console +[01:00:19.697] Verifying salsa20/10e438b3::core#1[0]::quarter_round[0] ... +[01:00:19.714] Simulating salsa20/10e438b3::core#1[0]::quarter_round[0] ... +[01:00:19.717] Checking proof obligations salsa20/10e438b3::core#1[0]::quarter_round[0] ... +[01:00:19.739] Subgoal failed: salsa20/10e438b3::core#1[0]::quarter_round[0] index out of bounds: the length is move _10 but the index is _9 +[01:00:19.739] SolverStats {solverStatsSolvers = fromList ["SBV->Z3"], solverStatsGoalSize = 53} +[01:00:19.739] ----------Counterexample---------- +[01:00:19.739] a: 2147483648 +::: + +Here, SAW complains that we have an `index out of bounds`. Recall that we are +indexing into the `state` array, which is of length 16, using the +`a`/`b`/`c`/`d` arguments. Each of these arguments are of type `usize`, and +because we are declaring these to be symbolic, it is quite possible for each +argument to be 16 or greater, which would cause the index into `state` to be +out of bounds. + +In practice, however, the only values of `a`/`b`/`c`/`d` that we will use are +less than 16. We can express this fact as a precondition: + +:::{literalinclude} code/salsa20/salsa20-quarter_round-fail4.saw +:lines: 13-16 +:language: sawscript +::: + +That is enough to finally get SAW to verify this very stripped-down version of +`quarter_round_spec`. Some good progress! But we aren't done yet, as we don't +yet say what happens to the value that `state` points to after the function +returns. This will be a requirement if we want to use `quarter_round_spec` in +compositional verification (and we do want this), so we should address this +shortly. + +Recall that unlike the Rust `quarter_round` function, the Cryptol +`quarterround` function doesn't have a `state` argument. This is because the +Rust function does slightly more than what the Cryptol function does. The Rust +function will look up elements of the `state` array, use them to perform the +computations that the Cryptol function does, and then insert the new values +back into the `state` array. To put it another way: the Rust function can be +thought of as a wrapper around the Cryptol function that also performs an +in-place bulk update of the `state` array. + +In Cryptol, one can look up elements of an array using the `(@@)` function, +and one can perform in-place array updates using the `updates` function. +This translates into a postcondition that looks like this: + +:::{literalinclude} code/salsa20/salsa20-quarter_round-fail4.saw +:lines: 28-30 +:language: sawscript +::: + +What does SAW think of this? Someone surprisingly, SAW finds a counterexample: + +:::{code-block} console +[01:43:30.065] Verifying salsa20/10e438b3::core#1[0]::quarter_round[0] ... +[01:43:30.078] Simulating salsa20/10e438b3::core#1[0]::quarter_round[0] ... +[01:43:30.084] Checking proof obligations salsa20/10e438b3::core#1[0]::quarter_round[0] ... +[01:43:30.801] Subgoal failed: salsa20/10e438b3::core#1[0]::quarter_round[0] Literal equality postcondition + +[01:43:30.801] SolverStats {solverStatsSolvers = fromList ["SBV->Z3"], solverStatsGoalSize = 1999} +[01:43:30.802] ----------Counterexample---------- +[01:43:30.802] a: 13 +[01:43:30.802] b: 3 +[01:43:30.802] c: 0 +[01:43:30.802] d: 0 +[01:43:30.802] state: [3788509705, 0, 0, 3223325776, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1074561051, 0, 0] +::: + +Note that in this counterexample, the values of `c` and `d` are the same. In +the Rust version of the function, each element in `state` is updated +sequentially, so if two of the array indices are the same, then the value that +was updated with the first index will later be overwritten by the value at the +later index. In the Cryptol version of the function, however, all of the +positions in the array are updated simultaneously. This implicitly assumes that +all of the array indices are disjoint from each other, an assumption that we +are not encoding into `quarter_round_spec`'s preconditions. + +At this point, it can be helpful to observe _how_ the `quarter_round` function +is used in practice. The call sites are found in the `rounds` function: + +:::{literalinclude} code/salsa20/src/core.rs +:lines: 92-102 +:language: rust +::: + +Here, we can see that the values of `a`/`b`/`c`/`d` will only ever be chosen +from a set of eight possible options. We can take advantage of this fact to +constrain the possible set of values for `a`/`b`/`c`/`d`. The latest iteration +of the `quarter_round_spec` is now: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 10-28 +:language: sawscript +::: + +Note that: + +* The `indices` value is constrained (via a precondition) to be one of the set + of values that is chosen in the `rounds` function. (Note that `\/` is the + logical-or function in Cryptol.) Each of these are concrete values that are + less than `STATE_WORDS` (`16`), so we no longer need a precondition stating + ``a < `STATE_WORDS /\ ...``. + +* Because we now reference `indices` in the preconditions, we have moved its + definition up. (Previously, it was defined in the postconditions section.) + +With this in place, will SAW verify `quarter_round_spec` now? + +:::{code-block} console +[02:14:02.037] Verifying salsa20/10e438b3::core#1[0]::quarter_round[0] ... +[02:14:02.051] Simulating salsa20/10e438b3::core#1[0]::quarter_round[0] ... +[02:14:02.057] Checking proof obligations salsa20/10e438b3::core#1[0]::quarter_round[0] ... +[02:14:18.616] Proof succeeded! salsa20/10e438b3::core#1[0]::quarter_round[0] +::: + +At long last, it succeeds. Hooray! SAW does have to think for a while, however, +as this proof takes about 17 seconds to complete. It would be unfortunate to +have to wait 17 seconds on every subsequent invocation of SAW, and since we +still have other functions to verify, this is a very real possibility. For this +reason, it can be helpful to temporarily turn this use of `mir_verify` into a +`mir_unsafe_assume_spec`: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 40-44 +:language: sawscript +::: + +Once we are done with the entire proof, we can come back and remove this use of +`mir_unsafe_assume_spec`, as we're only using it as a time-saving measure. + +## Verifying the `rounds` function + +Now that we've warmed up, let's try verifying the `rounds` function, which is +where `quarter_round` is invoked. Here is the full definition of `rounds`: + +:::{literalinclude} code/salsa20/src/core.rs +:lines: 90-108 +:language: rust +::: + +First, `rounds` performs `COUNT` rounds on the `state` argument. After this, it +takes each element of `self.state` and adds it to the corresponding element in +`state`. + +Linking back at the Salsa20 spec, we can see that the `rounds` function is +_almost_ an implementation of the Salsa20(x) hash function. The only notable +difference is that while the Salsa20(x) hash function converts the results to +little-endian form, the `rounds` function does not. `Salsa20.cry` implements +this part of the spec here: + +:::{literalinclude} code/salsa20/Salsa20.cry +:lines: 139-147 +:language: cryptol +::: + +Where `Salsa20` is the hash function, and `Salsa20_rounds` is the part of the +hash function that excludes the little-endian conversions. In other words, +`Salsa20_rounds` precisely captures the behavior of the Rust `rounds` function. + +One aspect of the `rounds` function that will make verifying it slightly +different from verifying `quarter_rounds` is that `rounds` is defined in an +`impl` block for the `Core` struct. This means that the `&mut self` argument in +`rounds` has the type `&mut Core`. As such, we will have to look up the +`Core` ADT in SAW using `mir_find_adt`. + +This raises another question, however: when looking up `Core`, what type +should we use to instantiate `R`? As noted above, our choices are `R8`, `R12`, +and `R20`, depending on how many rounds you want. For now, we'll simply +hard-code it so that `R` is instantiated to be `R8`, but we will generalize +this a bit later. + +Alright, enough chatter—time to start writing a proof. First, let's look up the +`R8` ADT. This is defined in the `salsa20` crate in the `rounds.rs` file, so +its identifier becomes `salsa20::rounds::R8`: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-1.saw +:lines: 46 +:language: sawscript +::: + +Next, we need to look up the `PhantomData` ADT, which is used in the +`rounds` field of the `Core` struct. This is defined in `core::marker`: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-1.saw +:lines: 47 +:language: sawscript +::: + +Finally, we must look up `Core` itself. Like `quarter_round`, the `Core` +struct is defined in `salsa20::core#1`: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-1.saw +:lines: 48 +:language: sawscript +::: + +Now that we have the necessary prerequisites, let's write a spec for the +`rounds` function. First, we need to allocate a reference for the `self` +argument: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-1.saw +:lines: 50-51 +:language: sawscript +::: + +Next, we need to create symbolic values for the fields of the `Core` struct, +which `self_ref` will point to. The `self.state` field will be a fresh array, +and the `self.rounds` field will be a simple, empty struct value: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-1.saw +:lines: 52-53 +:language: sawscript +::: + +Finally, putting all of the `self` values together: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-1.saw +:lines: 54-55 +:language: sawscript +::: + +Next, we need a `state` argument (not to be confused with the `self.state` +field in `Core`). This is handled much the same as it was in +`quarter_round_spec`: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-1.saw +:lines: 57-59 +:language: sawscript +::: + +Lastly, we cap it off with a call to `mir_execute_func`: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-1.saw +:lines: 61-62 +:language: sawscript +::: + +(Again, we're missing some postconditions describing what `self_ref` and +`state_ref` point to after the function returns, but we'll return to that in a +bit.) + +If we run SAW at this point, we see that everything in `rounds_spec` +typechecks, so we're off to a good start. Let's keep going and add a +`mir_verify` call. + +Here, we are faced with an interesting question: what is the identifier for +`rounds::`? The `rounds` function is defined using generics, so we can't +verify it directly—we must instead verify a particular instantiation of +`rounds`. At present, there isn't a good way to look up what the identifiers +for instantiations of generic functions are (there [will be in the +future](https://github.com/GaloisInc/saw-script/issues/1980)), but it turns out +that the identifier for `rounds::` is this: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-1.saw +:lines: 64-65 +:language: sawscript +::: + +Note that we are using `quarter_round_ov` as a compositional override. Once +again, SAW is happy with our work thus far: + +:::{code-block} console +[03:12:35.990] Proof succeeded! salsa20/10e438b3::core#1[0]::{impl#0}[0]::rounds[0]::_inst6e4a2d7250998ef7[0] +::: + +Nice. Now let's go back and fill in the missing postconditions in +`rounds_spec`. In particular, we must declare what happens to both `self_ref` +and `state_ref`. A closer examination of the code in the Rust `rounds` function +reveals that the `self` argument is never modified at all, so that part is +easy: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-2.saw +:lines: 63 +:language: sawscript +::: + +The `state` argument, on the other hand, is modified in-place. This time, our +job is made easier by the fact that `Salsa20_rounds` implements _exactly_ what +we need. Because we are instantiating `rounds` at type `R8`, we must explicitly +state that we are using 8 rounds: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-2.saw +:lines: 64 +:language: sawscript +::: + +Once again, SAW is happy with our work. We're on a roll! + +Now let's address the fact that we are hard-coding everything to `R8`, which is +somewhat uncomfortable. We can make things better by allowing the user to +specify the number of rounds. The first thing that we will need to change is +the `r_adt` definition, which is response for looking up `R8`. We want to turn +this into a function that, depending on the user input, will look up `R8`, +`R12`, or `R20`: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-3.saw +:lines: 46 +:language: sawscript +::: + +Where `str_concat` is a SAW function for concatenating strings together: + +:::{code-block} console +sawscript> :type str_concat +String -> String -> String +::: + +We also want to parameterize `phantom_data_adt` and `core_adt`: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-3.saw +:lines: 47-48 +:language: sawscript +::: + +Next, we need to parameterize `rounds_spec` by the number of rounds. This will +require changes in both the preconditions and postconditions. On the +preconditions side, we must pass the number of rounds to the relevant +functions: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-3.saw +:lines: 50-56 +:language: sawscript +::: + +And on the postconditions side, we must pass the number of rounds to the +Cryptol `Salsa20_rounds` function: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-3.saw +:lines: 66-67 +:language: sawscript +::: + +Finally, we must adjust the call to `rounds_spec` in the context of +`mir_verify` so that we pick `8` as the number of rounds: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-3.saw +:lines: 69-70 +:language: sawscript +::: + +SAW is happy with this generalization. To demonstrate that we have generalized +things correctly, we can also verify the same function at `R20` instead of +`R8`: + +:::{literalinclude} code/salsa20/salsa20-rounds-take-3.saw +:lines: 71-72 +:language: sawscript +::: + +The only things that we had to change were the identifier and the argument to +`rounds_spec`. Not bad! + +## Verifying the `counter_setup` function + +We're very nearly at the point of being able to verify `apply_keystream`. +Before we do, however, there is one more function that `apply_keystream` calls, +which we ought to verify first: `counter_setup`. Thankfully, the implementation +of `counter_setup` is short and sweet: + +:::{literalinclude} code/salsa20/src/core.rs +:lines: 83-86 +:language: rust +::: + +This updates the elements of the `state` array at indices `8` and `9` with the +lower 32 bits and the upper 32 bits of the `counter` argument, respecitvely. +At a first glance, there doesn't appear to be any function in `Salsa20.cry` +that directly corresponds to what `counter_setup` does. This is a bit of a +head-scratcher, but the answer to this mystery will become more apparent as we +get further along in the proof. + +For now, we should take matters into our own hands and write our own Cryptol +spec for `counter_setup`. To do this, we will create a new Cryptol file named +`Salsa20Extras.cry`, which imports from `Salsa20.cry`: + +:::{literalinclude} code/salsa20/Salsa20Extras.cry +:lines: 1-3 +:language: cryptol +::: + +The Cryptol implementation of `counter_setup` will need arrays of length +`STATE_WORDS`, so we shall define `STATE_WORDS` first: + +:::{literalinclude} code/salsa20/Salsa20Extras.cry +:lines: 5 +:language: cryptol +::: + +Note that we preceded this definition with the `type` keyword. In Cryptol, +sequence lengths are encoded at the type level, so if we want to use +`STATE_WORDS` at the type level, we must declare it as a `type`. + +Finally, we can write a Cryptol version of `counter_setup` using our old friend +`updates` to perform a bulk sequence update: + +:::{literalinclude} code/salsa20/Salsa20Extras.cry +:lines: 8-10 +:language: cryptol +::: + +Note that `counter` is a 64-bit word, but the elements of the `state` sequence +are 32-bit words. As a result, we have to explicitly truncate `counter` and +`counter >> 32` to 32-bit words by using the `drop` function, which drops the +first 32 bits from each word. + +Returning to `salsa20.saw`, we must now make use of our new Cryptol file by +`import`ing it at the top: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 5 +:language: sawscript +::: + +With the `counter_setup` Cryptol implementation in scope, we can now write +a spec for the Rust `counter_setup` function. There's not too much to remark +on here, as the spec proves relatively straightforward to write: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 74-90 +:language: sawscript +::: + +We can now verify `counter_setup` against `counter_setup_spec` at lengths `8` +and `20`: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 92-95 +:language: sawscript +::: + +That wasn't so bad. It's a bit unsatisfying that we had to resort to writing a +Cryptol function not found in `Salsa20.cry`, but go along with this for now—it +will become apparent later why this needed to be done. + +## Verifying the `apply_keystream` function (first attempt) + +It's time. Now that we've verified `rounds` and `counter_setup`, it's time to +tackle the topmost function in the call stack: `apply_keystream`: + +:::{literalinclude} code/salsa20/src/core.rs +:lines: 68-80 +:language: rust +::: + +There aren't _that_ many lines of code in this function, but there is still +quite a bit going on. Let's walk through `apply_keystream` in more detail: + +1. The `output` argument represents the message to encrypt (or decrypt). + `output` is a slice of bytes, so in principle, `output` can have an arbitrary + length. That being said, the first line of `apply_keystream`'s implementation + checks that `output`'s length is equal to `BLOCK_SIZE`: + + :::{literalinclude} code/salsa20/src/core.rs + :lines: 69 + :language: rust + ::: + + Where `BLOCK_SIZE` is defined here: + + :::{literalinclude} code/salsa20/src/lib.rs + :lines: 82-83 + :language: rust + ::: + + So in practice, `output` must have exactly 64 elements. + +2. Next, `apply_keystream` invokes the `counter_setup` and `rounds` functions + to set up the keystream (the local `state` variable). + +3. Finally, `apply_keystream` combines the keystream with `output`. It does + so by chunking `output` into a sequence of 4 bytes, and then it XOR's the + value of each byte in-place with the corresponding byte from the keystream. + This performs little-endian conversions as necessary. + +The fact that we are XOR'ing bytes strongly suggests that this is an +implementation of the Salsa20 encryption function from the spec. There is an +important difference between how the Salsa20 spec defines the encryption +function versus how `apply_keystream` defines it, however. In the Salsa20 spec, +encryption is a function of a key, nonce, and a message. `apply_keystream`, on +the other hand, is a function of `self`'s internal state, a counter, and a +message. The two aren't _quite_ the same, which is makes it somewhat tricky to +describe one in terms of the other. + +`Salsa20.cry` defines a straightforward Cryptol port of the Salsa20 encryption +function from the spec, named `Salsa20_encrypt`. Because it takes a key and a +nonce as an argument, it's not immediately clear how we'd tie this back to +`apply_keystream`. But no matter: we can do what we did before and define our +own Cryptol version of `apply_keystream` in `Salsa20Extras.cry`: + +:::{literalinclude} code/salsa20/Salsa20Extras.cry +:lines: 12-17 +:language: cryptol +::: + +This implementation builds on top of the Cryptol `counter_setup` and +`Salsa20_rounds` functions, which we used as the reference implementations for +the Rust `counter_setup` and `rounds` functions, respectively. We also make +sure to define a `BLOCK_SIZE` type alias at the top of the file: + +:::{literalinclude} code/salsa20/Salsa20Extras.cry +:lines: 6 +:language: cryptol +::: + +Now let's write a SAW spec for `apply_keystream`. Once again, we will need to +reference `BLOCK_SIZE` when talking about the `output`-related parts of the +spec, so make sure to define `BLOCK_SIZE` at the top of the `.saw` file: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 8 +:language: sawscript +::: + +First, we need to declare all of our arguments, which proceeds as you would +expect: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 120-136 +:language: sawscript +::: + +What about the postconditions? We have two mutable references to contend with: +`self_ref` and `output_ref`. The postcondition for `self_ref` is fairly +straightforward: the only time it is ever modified is when `counter_setup` is +called. This means that after the `apply_keystream` function has returned, +`self_ref` will point to the results of calling the `counter_setup` Cryptol +function: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 138-140 +:language: sawscript +::: + +`output_ref` is where the interesting work happenings. After the Rust +`apply_keystream` function has returned, it will point to the results of +calling the Cryptol `apply_keystream` function that we just defined: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 141-142 +:language: sawscript +::: + +Finally, we can put this all together and verify `apply_keystream` against +`apply_keystream_spec` at lengths `8` and `20`: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 144-148 +:language: sawscript +::: + +SAW will successfully verify these. We've achieved victory... or have we? +Recall that we had to tailor the Cryptol `apply_keystream` function to +specifically match the behavior of the corresponding Rust code. This makes the +proof somewhat underwhelming, since the low-level implementation is nearly +identical to the high-level spec. + +A more impressive proof would require linking `apply_keystream` to a Cryptol +function in the `Salsa20.cry` file, which was developed independently of the +Rust code. As we mentioned before, however, doing so will force us to reconcile +the differences in the sorts of arguments that each function takes, as +`apply_keystream` doesn't take a key or nonce argument. Time to think for a +bit. + +## Verifying the `new_raw` function + +At this point, we should ask ourselves: _why_ doesn't `apply_keystream` take a +key or nonce argument? The reason lies in the fact that the `salsa20` crate +implements Salsa20 in a stateful way. Specifically, the `Core` struct stores +internal state that is used to compute the keystream to apply when hashing. In +order to use this internal state, however, we must first initialize it. The +`new` function that is responsible for this initialization: + +:::{literalinclude} code/salsa20/src/core.rs +:lines: 17-20 +:language: rust +::: + +Sure enough, this function takes a key and a nonce as an argument! This is a +critical point that we overlooked. When using the `salsa20` crate, you wouldn't +use the `apply_keystream` function in isolation. Instead, you would create an +initial `Core` value using `new`, and _then_ you would invoke +`apply_keystream`. The Salsa20 spec effectively combines both of these +operations in is encryption function, whereas the `salsa20` splits these two +operations into separate functions altogether. + +Strictly speaking, we don't need to verify `new` in order to verify +`apply_keystream`, as the latter never invokes the former. Still, it will be a +useful exercise to verify `new`, as the insights we gain when doing so will +help us write a better version of `apply_keystream_spec`. + +All that being said, we probably to verify `new_raw` (a lower-level helper +function) rather than `new` itself. This is because the definitions of `Key` +and `Nonce` are somewhat involved. For instance, `Key` is defined as: + +:::{literalinclude} code/salsa20/src/salsa.rs +:lines: 27 +:language: rust +::: + +[`GenericArray`](https://docs.rs/generic-array/latest/generic_array/struct.GenericArray.html) +is a somewhat complicated abstraction. Luckily, we don't really _need_ to deal +with it, since `new_raw` deals with simple array references rather than +`GenericArray`s: + +:::{literalinclude} code/salsa20/src/core.rs +:lines: 22-23 +:language: rust +::: + +The full implementation of `new_raw` is rather long, so we won't inline the +whole thing here. At a high level, it initializes the `state` array of a `Core` +value by populating each element of the array with various things. Some +elements of the array are populated with `key`, some parts are populated with +`iv` (i.e., the nonce), and other parts are populated with an array named +`CONSTANTS`: + +:::{literalinclude} code/salsa20/src/lib.rs +:lines: 91-92 +:language: rust +::: + +The comment about `"expand 32-byte k"` is a strong hint that `new_raw` is +implementing a portion of the Salsa20 expansion function from the spec. (No +really, the spec literally says to use the exact string `"expand 32-byte +k"`—look it up!) The `Salsa20.cry` Cryptol file has an implementation of this +portion of the expansion function, which is named `Salsa20_init`: + +:::{literalinclude} code/salsa20/Salsa20.cry +:lines: 182-189 +:language: cryptol +::: + +Note that we were careful to say a _portion_ of the Salsa20 expansion function. +There is also a Cryptol implementation of the full expansion function, named +`Salsa20_expansion`: + +:::{literalinclude} code/salsa20/Salsa20.cry +:lines: 179-180 +:language: cryptol +::: + +This calls `Salsa20_init` followed by `Salsa20`, the latter of which performs +hashing. Importantly, `new_raw` does _not_ do any hashing on its own, just +initialization. For this reason, we want to use `Salsa20_init` as the reference +implementation of `new_raw`, not `Salsa20_expansion`. + +Alright, time to write a SAW spec. The first part of the spec is straightforward: + +:::{literalinclude} code/salsa20/salsa20-new_raw-fail1.saw +:lines: 97-106 +:language: sawscript +::: + +As is usually the case, the postconditions are the tricky part. We know that +the behavior of `new_raw` will roughly coincide with the `Salsa20_init` +function, so let's try that first: + +:::{literalinclude} code/salsa20/salsa20-new_raw-fail1.saw +:lines: 108-112 +:language: sawscript +::: + +If we attempt to verify this using `mir_verify`: + +:::{literalinclude} code/salsa20/salsa20-new_raw-fail1.saw +:lines: 115-118 +:language: sawscript +::: + +SAW complains thusly: + +:::{code-block} console +Cryptol error: +[error] at salsa20.saw:109:45--109:54: + Type mismatch: + Expected type: 16 + Inferred type: 8 + Context: [ERROR] _ + When checking type of 2nd tuple field +::: + +Here, the 2nd tuple field is the `nonce_arr` in `Salsa20_init(key_arr, +nonce_arr)`. And sure enough, `Salsa20_init` expects the 2nd tuple field to be +a sequence of 16 elements, but `nonce_arr` only has 8 elements. Where do we get +the remaining 8 elements from? + +The answer to this question can be found by looking at the implementation of +`new_raw` more closely. Let's start at this code: + +:::{literalinclude} code/salsa20/src/core.rs +:lines: 35-36 +:language: rust +::: + +This will chunk up `iv` (the nonce) into two 4-byte chunks and copies them over +to the elements of `state` array at indices `6` and `7`. This is immediately +followed by two updates at indices `8` and `9`, which are updated to be `0`: + +:::{literalinclude} code/salsa20/src/core.rs +:lines: 39-40 +:language: rust +::: + +If you take the two 4-bytes chunks of `iv` and put two 4-byte `0` values after +them, then you would have a total of 16 bytes. This suggests that the nonce +value that `Salsa20_init` expects is actually this: + +:::{code-block} cryptol +nonce_arr # zero : [16][8] +::: + +Where `zero : [8][8]` is a Cryptol expression that returns all zeroes, and +`(#)` is the Cryptol operator for concatenating two sequences together. Let's +update `new_raw_spec` to reflect this: + +:::{literalinclude} code/salsa20/salsa20-new_raw-fail2.saw +:lines: 109 +:language: sawscript +::: + +This is closer to what we want, but not quite. SAW still complains: + +:::{code-block} console +could not match specified value with actual value: + ... + type of actual value: [u32; 16] + type of specified value: [u8; 64] +::: + +This is because `Salsa20_init` returns something of type `[64][8]`, which +corresponds to the Rust type `[u8; 64]`. `self.state`, on the other hand, is of +type `[u32; 16]`. These types are very close, as they both contain the same +number of bytes, but they are chunked up differently. Recall the code that +copies the nonce value over to `self.state`: + +:::{literalinclude} code/salsa20/src/core.rs +:lines: 35-36 +:language: rust +::: + +In order to resolve the type differences between `iv` and `state`, this code +needed to explicitly convert `iv` to little-endian form using the +[`u32::from_le_bytes`](https://doc.rust-lang.org/std/primitive.u32.html#method.from_le_bytes) +function. There is a similar Cryptol function in `Salsa20.cry` named +`littleendian_state`: + +:::{literalinclude} code/salsa20/Salsa20.cry +:lines: 131-132 +:language: cryptol +::: + +Note that `[64][8]` is the Cryptol equivalent of `[u8; 64]`, and `[16][32]` is +the Cryptol equivalent of `[u32; 16]`. As such, this is exactly the function +that we need to resolve the differences in types: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 109 +:language: sawscript +::: + +With that change, SAW is finally happy with `new_raw_spec` and successfully +verifies it. + +There is an interesting connection between the `new_raw` and `counter_setup` +functions. Both functions perform in-place updates on `state` at indices `8` +and `9`. Whereas `new_raw` always sets these elements of `state` to `0`, +`counter_setup` will set them to the bits of the `counter` argument (after +converting `counter` to little-endian form). This means that if you invoke +`counter_setup` right after `new_raw`, then `counter_setup` would overwrite the +`0` values with the `counter` argument. In order words, it would be tantamount +to initializing `state` like so: + +:::{code-block} cryptol +littleendian_state (Salsa20_init(key, nonce # littleendian_inverse counter)) +::: + +Where `littleendian_inverse` (a sibling of `littleendian_state`) converts a +`[64]` value to a `[8][8]` one. This pattern is a curious one... + +## Verifying the `apply_keystream` function (second attempt) + +Let's now return to the problem of linking `apply_keystream` up to +`Salsa20_encrypt`. In particular, let's take a closer look at the definition of +`Salsa20_encrypt` itself: + +:::{literalinclude} code/salsa20/Salsa20.cry +:lines: 198-201 +:language: cryptol +::: + +Does anything about this definition strike you as interesting? Take a look at +the `v#(littleendian_inverse i)` part—we _just_ saw a use of +`littleendian_inverse` earlier in our discussion about initializing the +`state`! Moreover, `v` is the nonce argument, so it is becoming clearer that +`Sals20_encrypt` is creating an initial state is much the same way that +`new_raw` is. + +A related question: what is the `i` value? The answer is somewhat technical: +the Salsa20 encryption function is designed to work with messages with +differing numbers of bytes (up to `2^^70` bytes, to be exact). Each 8-byte +chunk in the message will be encrypted with a slightly difference nonce. For +instance, the first 8-byte chunk's nonce will have its lower 32 bits set to +`0`, the second 8-byte chunk's nonce will have its lower 32 bits set to `1`, +and so on. In general, the `i`th 8-byte chunk's nonce will have its lower 32 +bits set to `i`, and this corresponds exactly to the `i` in the expression +`littleendian_inverse i`. + +Note, however, that `apply_keystream` only ever uses a message that consists of +exactly eight 8-byte chunks. This means that `Salsa20_encrypt` will only ever +invoke `Salsa20_expansion` once with a nonce value where the lower 32 bits are +set to `0`. That is, it will perform encryption with an initial state derived +from: + +:::{code-block} cryptol +Salsa20_init(k, v#(littleendian_inverse zero)) +::: + +Which can be further simplified to `Salsa20_init(k, v # zero)`. This is very +nearly what we want, as this gives us the behavior of the Rust `new_raw` +function. There's just one problem though: it doesn't take the behavior of +`counter_setup` into account. How do we go from `zero` to `littleendian_inverse +counter`? + +While `Salsa20_encrypt` doesn't take counters into account at all, it is not +too difficult to generalize `Salsa20_encrypt` in this way. There is a variant +of `Salsa20_encrypt` in the same file named `Salsa20_encrypt_with_offset`, +where the offset argument `o` serves the same role that `counter` does in +`counter_setup`: + +:::{literalinclude} code/salsa20/Salsa20.cry +:lines: 191-196 +:language: cryptol +::: + +(Observe that `Salsa20_encrypt(count, k, v, m)` is equivalent to +`Salsa20_encrypt_with_offset(count, k, v, 0, m)`.) + +At long last, we have discovered the connection between `apply_keystream` and +the Salsa20 spec. If you assume that you invoke `new_raw` beforehand, then the +behavior of `apply_keystream` corresponds exactly to that of +`Salsa20_encrypt_with_offset`. This insight will inform us how to write an +alternative SAW spec for `apply_keystream`: + +:::{literalinclude} code/salsa20/salsa20-apply_keystream_alt-fail1.saw +:lines: 149-160 +:language: sawscript +::: + +Observe the following differences between `apply_keystream_alt_spec` and our +earlier `apply_keystream_spec`: + +1. In `apply_keystream_alt_spec`, we declare fresh `key` and `nonce` values, + which weren't present at all in `apply_keystream_spec`. + +2. In `apply_keystream_alt_spec`, we no longer make `self_state` a fresh, + unconstrained value. Instead, we declare that it must be the result of + calling `Salsa20_init` on the `key`, `nonce`, and `counter` values. This + is the part that encodes the assumption that `new_raw` was invoked + beforehand. + +The parts of the spec relating to `output` remain unchanged: + +:::{literalinclude} code/salsa20/salsa20-apply_keystream_alt-fail1.saw +:lines: 162-167 +:language: sawscript +::: + +The postconditions are slightly different in `apply_keystream_alt_spec`. While +the parts relating to `self_ref` remain unchanged, we now have `output_ref` +point to the results of calling `Salsa20_encrypt_with_offset`: + +:::{literalinclude} code/salsa20/salsa20-apply_keystream_alt-fail1.saw +:lines: 169-172 +:language: sawscript +::: + +Tying this all together, we call `mir_verify`, making sure to use compositional +overrides involving `counter_setup` and `rounds`: + +:::{literalinclude} code/salsa20/salsa20-apply_keystream_alt-fail1.saw +:lines: 175-178 +:language: sawscript +::: + +At long last, it is time to run SAW on this. When we do, we see this: + +:::{code-block} console +[15:11:44.576] Checking proof obligations salsa20/10e438b3::core#1[0]::{impl#0}[0]::apply_keystream[0]::_inst6e4a2d7250998ef7[0] ... +::: + +After this, SAW loops forever. Oh no! While somewhat disheartening, this is a +reality of SMT-based verification that we must content with. SMT solvers are +extremely powerful, but their performance can sometimes be unpredictable. The +task of verifying `apply_keystream_alt_spec` is _just_ complicated enough that +Z3 cannot immediately figure out that the proof is valid, so it resorts to much +slower algorithms to solve proof goals. + +We could try waiting for Z3 to complete, but we'd be waiting for a long time. +It's not unheard of for SMT solvers to take many hours on especially hard +problems, but we don't have that many hours to spare. We should try a slightly +different approach instead. + +When confronted with an infinite loop in SAW, there isn't a one-size-fits-all +solution that will cure the problem. Sometimes, it is worth stating your SAW +spec in a slightly different way such that the SMT solver can spot patterns +that it couldn't before. Other times, it can be useful to try and break the +problem up into smaller functions and use compositional verification to handle +the more complicated subfunctions. As we mentioned before, the performance of +SMT solvers in unpredictable, and it's not always obvious what the best +solution is. + +In this example, however, the problem lies with Z3 itself. As it turns out, +Yices (a different SMT solver) _can_ spot the patterns needed to prove +`apply_keystream_alt_spec` immediately. Fortunately, SAW includes support for +both Z3 and Yices. In order to switch from Z3 to Yices, swap out the `z3` proof +script with `yices`: + +:::{literalinclude} code/salsa20/salsa20-reference.saw +:lines: 175-178 +:language: sawscript +::: + +After doing this, SAW is leverage Yices to solve the proof goals almost +immediately: + +:::{code-block} console +[15:22:00.745] Proof succeeded! salsa20/10e438b3::core#1[0]::{impl#0}[0]::apply_keystream[0]::_instfa33e77d840484a0[0] +::: + +And with that, we're finally done! You've successfully completed a non-trivial +SAW exercise in writing some interesting proofs. Give yourself a well-deserved +pat on the back. + +The process of developing these proofs was bumpy at times, but that is to be +expected. You very rarely get a proof correct on the very first try, and when +SAW doesn't accept your proof, it is important to be able to figure out what +went wrong and how to fix it. This is a skill that takes some time to grow, but +with enough time and experience, you will be able to recognize common pitfalls. +This case study showed off some of these pitfalls, but there are likely others. diff --git a/doc/rust-tutorial/code/.gitignore b/doc/rust-verification-with-saw/code/.gitignore similarity index 100% rename from doc/rust-tutorial/code/.gitignore rename to doc/rust-verification-with-saw/code/.gitignore diff --git a/doc/rust-tutorial/code/Makefile b/doc/rust-verification-with-saw/code/Makefile similarity index 100% rename from doc/rust-tutorial/code/Makefile rename to doc/rust-verification-with-saw/code/Makefile diff --git a/doc/rust-tutorial/code/arrays-fail.saw b/doc/rust-verification-with-saw/code/arrays-fail.saw similarity index 100% rename from doc/rust-tutorial/code/arrays-fail.saw rename to doc/rust-verification-with-saw/code/arrays-fail.saw diff --git a/doc/rust-tutorial/code/arrays.linked-mir.json b/doc/rust-verification-with-saw/code/arrays.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/arrays.linked-mir.json rename to doc/rust-verification-with-saw/code/arrays.linked-mir.json diff --git a/doc/rust-tutorial/code/arrays.rs b/doc/rust-verification-with-saw/code/arrays.rs similarity index 100% rename from doc/rust-tutorial/code/arrays.rs rename to doc/rust-verification-with-saw/code/arrays.rs diff --git a/doc/rust-tutorial/code/arrays.saw b/doc/rust-verification-with-saw/code/arrays.saw similarity index 100% rename from doc/rust-tutorial/code/arrays.saw rename to doc/rust-verification-with-saw/code/arrays.saw diff --git a/doc/rust-tutorial/code/enums.linked-mir.json b/doc/rust-verification-with-saw/code/enums.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/enums.linked-mir.json rename to doc/rust-verification-with-saw/code/enums.linked-mir.json diff --git a/doc/rust-tutorial/code/enums.rs b/doc/rust-verification-with-saw/code/enums.rs similarity index 100% rename from doc/rust-tutorial/code/enums.rs rename to doc/rust-verification-with-saw/code/enums.rs diff --git a/doc/rust-tutorial/code/enums.saw b/doc/rust-verification-with-saw/code/enums.saw similarity index 100% rename from doc/rust-tutorial/code/enums.saw rename to doc/rust-verification-with-saw/code/enums.saw diff --git a/doc/rust-tutorial/code/first-example.linked-mir.json b/doc/rust-verification-with-saw/code/first-example.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/first-example.linked-mir.json rename to doc/rust-verification-with-saw/code/first-example.linked-mir.json diff --git a/doc/rust-tutorial/code/first-example.rs b/doc/rust-verification-with-saw/code/first-example.rs similarity index 100% rename from doc/rust-tutorial/code/first-example.rs rename to doc/rust-verification-with-saw/code/first-example.rs diff --git a/doc/rust-tutorial/code/generics-take-1.linked-mir.json b/doc/rust-verification-with-saw/code/generics-take-1.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/generics-take-1.linked-mir.json rename to doc/rust-verification-with-saw/code/generics-take-1.linked-mir.json diff --git a/doc/rust-tutorial/code/generics-take-1.rs b/doc/rust-verification-with-saw/code/generics-take-1.rs similarity index 100% rename from doc/rust-tutorial/code/generics-take-1.rs rename to doc/rust-verification-with-saw/code/generics-take-1.rs diff --git a/doc/rust-tutorial/code/generics-take-2.linked-mir.json b/doc/rust-verification-with-saw/code/generics-take-2.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/generics-take-2.linked-mir.json rename to doc/rust-verification-with-saw/code/generics-take-2.linked-mir.json diff --git a/doc/rust-tutorial/code/generics-take-2.rs b/doc/rust-verification-with-saw/code/generics-take-2.rs similarity index 100% rename from doc/rust-tutorial/code/generics-take-2.rs rename to doc/rust-verification-with-saw/code/generics-take-2.rs diff --git a/doc/rust-tutorial/code/overrides-fail.saw b/doc/rust-verification-with-saw/code/overrides-fail.saw similarity index 100% rename from doc/rust-tutorial/code/overrides-fail.saw rename to doc/rust-verification-with-saw/code/overrides-fail.saw diff --git a/doc/rust-tutorial/code/overrides-mut-fail.saw b/doc/rust-verification-with-saw/code/overrides-mut-fail.saw similarity index 100% rename from doc/rust-tutorial/code/overrides-mut-fail.saw rename to doc/rust-verification-with-saw/code/overrides-mut-fail.saw diff --git a/doc/rust-tutorial/code/overrides-mut.linked-mir.json b/doc/rust-verification-with-saw/code/overrides-mut.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/overrides-mut.linked-mir.json rename to doc/rust-verification-with-saw/code/overrides-mut.linked-mir.json diff --git a/doc/rust-tutorial/code/overrides-mut.rs b/doc/rust-verification-with-saw/code/overrides-mut.rs similarity index 100% rename from doc/rust-tutorial/code/overrides-mut.rs rename to doc/rust-verification-with-saw/code/overrides-mut.rs diff --git a/doc/rust-tutorial/code/overrides-mut.saw b/doc/rust-verification-with-saw/code/overrides-mut.saw similarity index 100% rename from doc/rust-tutorial/code/overrides-mut.saw rename to doc/rust-verification-with-saw/code/overrides-mut.saw diff --git a/doc/rust-tutorial/code/overrides-unsafe.saw b/doc/rust-verification-with-saw/code/overrides-unsafe.saw similarity index 100% rename from doc/rust-tutorial/code/overrides-unsafe.saw rename to doc/rust-verification-with-saw/code/overrides-unsafe.saw diff --git a/doc/rust-tutorial/code/overrides.linked-mir.json b/doc/rust-verification-with-saw/code/overrides.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/overrides.linked-mir.json rename to doc/rust-verification-with-saw/code/overrides.linked-mir.json diff --git a/doc/rust-tutorial/code/overrides.rs b/doc/rust-verification-with-saw/code/overrides.rs similarity index 100% rename from doc/rust-tutorial/code/overrides.rs rename to doc/rust-verification-with-saw/code/overrides.rs diff --git a/doc/rust-tutorial/code/overrides.saw b/doc/rust-verification-with-saw/code/overrides.saw similarity index 100% rename from doc/rust-tutorial/code/overrides.saw rename to doc/rust-verification-with-saw/code/overrides.saw diff --git a/doc/rust-tutorial/code/ref-basics-fail.saw b/doc/rust-verification-with-saw/code/ref-basics-fail.saw similarity index 100% rename from doc/rust-tutorial/code/ref-basics-fail.saw rename to doc/rust-verification-with-saw/code/ref-basics-fail.saw diff --git a/doc/rust-tutorial/code/ref-basics.linked-mir.json b/doc/rust-verification-with-saw/code/ref-basics.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/ref-basics.linked-mir.json rename to doc/rust-verification-with-saw/code/ref-basics.linked-mir.json diff --git a/doc/rust-tutorial/code/ref-basics.rs b/doc/rust-verification-with-saw/code/ref-basics.rs similarity index 100% rename from doc/rust-tutorial/code/ref-basics.rs rename to doc/rust-verification-with-saw/code/ref-basics.rs diff --git a/doc/rust-tutorial/code/ref-basics.saw b/doc/rust-verification-with-saw/code/ref-basics.saw similarity index 100% rename from doc/rust-tutorial/code/ref-basics.saw rename to doc/rust-verification-with-saw/code/ref-basics.saw diff --git a/doc/rust-tutorial/code/salsa20/.gitignore b/doc/rust-verification-with-saw/code/salsa20/.gitignore similarity index 100% rename from doc/rust-tutorial/code/salsa20/.gitignore rename to doc/rust-verification-with-saw/code/salsa20/.gitignore diff --git a/doc/rust-tutorial/code/salsa20/CHANGELOG.md b/doc/rust-verification-with-saw/code/salsa20/CHANGELOG.md similarity index 100% rename from doc/rust-tutorial/code/salsa20/CHANGELOG.md rename to doc/rust-verification-with-saw/code/salsa20/CHANGELOG.md diff --git a/doc/rust-tutorial/code/salsa20/Cargo.lock b/doc/rust-verification-with-saw/code/salsa20/Cargo.lock similarity index 100% rename from doc/rust-tutorial/code/salsa20/Cargo.lock rename to doc/rust-verification-with-saw/code/salsa20/Cargo.lock diff --git a/doc/rust-tutorial/code/salsa20/Cargo.toml b/doc/rust-verification-with-saw/code/salsa20/Cargo.toml similarity index 100% rename from doc/rust-tutorial/code/salsa20/Cargo.toml rename to doc/rust-verification-with-saw/code/salsa20/Cargo.toml diff --git a/doc/rust-tutorial/code/salsa20/LICENSE-APACHE b/doc/rust-verification-with-saw/code/salsa20/LICENSE-APACHE similarity index 100% rename from doc/rust-tutorial/code/salsa20/LICENSE-APACHE rename to doc/rust-verification-with-saw/code/salsa20/LICENSE-APACHE diff --git a/doc/rust-tutorial/code/salsa20/LICENSE-MIT b/doc/rust-verification-with-saw/code/salsa20/LICENSE-MIT similarity index 100% rename from doc/rust-tutorial/code/salsa20/LICENSE-MIT rename to doc/rust-verification-with-saw/code/salsa20/LICENSE-MIT diff --git a/doc/rust-tutorial/code/salsa20/README.md b/doc/rust-verification-with-saw/code/salsa20/README.md similarity index 100% rename from doc/rust-tutorial/code/salsa20/README.md rename to doc/rust-verification-with-saw/code/salsa20/README.md diff --git a/doc/rust-tutorial/code/salsa20/Salsa20.cry b/doc/rust-verification-with-saw/code/salsa20/Salsa20.cry similarity index 100% rename from doc/rust-tutorial/code/salsa20/Salsa20.cry rename to doc/rust-verification-with-saw/code/salsa20/Salsa20.cry diff --git a/doc/rust-tutorial/code/salsa20/Salsa20Extras.cry b/doc/rust-verification-with-saw/code/salsa20/Salsa20Extras.cry similarity index 100% rename from doc/rust-tutorial/code/salsa20/Salsa20Extras.cry rename to doc/rust-verification-with-saw/code/salsa20/Salsa20Extras.cry diff --git a/doc/rust-tutorial/code/salsa20/salsa20-apply_keystream_alt-fail1.saw b/doc/rust-verification-with-saw/code/salsa20/salsa20-apply_keystream_alt-fail1.saw similarity index 100% rename from doc/rust-tutorial/code/salsa20/salsa20-apply_keystream_alt-fail1.saw rename to doc/rust-verification-with-saw/code/salsa20/salsa20-apply_keystream_alt-fail1.saw diff --git a/doc/rust-tutorial/code/salsa20/salsa20-new_raw-fail1.saw b/doc/rust-verification-with-saw/code/salsa20/salsa20-new_raw-fail1.saw similarity index 100% rename from doc/rust-tutorial/code/salsa20/salsa20-new_raw-fail1.saw rename to doc/rust-verification-with-saw/code/salsa20/salsa20-new_raw-fail1.saw diff --git a/doc/rust-tutorial/code/salsa20/salsa20-new_raw-fail2.saw b/doc/rust-verification-with-saw/code/salsa20/salsa20-new_raw-fail2.saw similarity index 100% rename from doc/rust-tutorial/code/salsa20/salsa20-new_raw-fail2.saw rename to doc/rust-verification-with-saw/code/salsa20/salsa20-new_raw-fail2.saw diff --git a/doc/rust-tutorial/code/salsa20/salsa20-quarter_round-fail1.saw b/doc/rust-verification-with-saw/code/salsa20/salsa20-quarter_round-fail1.saw similarity index 100% rename from doc/rust-tutorial/code/salsa20/salsa20-quarter_round-fail1.saw rename to doc/rust-verification-with-saw/code/salsa20/salsa20-quarter_round-fail1.saw diff --git a/doc/rust-tutorial/code/salsa20/salsa20-quarter_round-fail2.saw b/doc/rust-verification-with-saw/code/salsa20/salsa20-quarter_round-fail2.saw similarity index 100% rename from doc/rust-tutorial/code/salsa20/salsa20-quarter_round-fail2.saw rename to doc/rust-verification-with-saw/code/salsa20/salsa20-quarter_round-fail2.saw diff --git a/doc/rust-tutorial/code/salsa20/salsa20-quarter_round-fail3.saw b/doc/rust-verification-with-saw/code/salsa20/salsa20-quarter_round-fail3.saw similarity index 100% rename from doc/rust-tutorial/code/salsa20/salsa20-quarter_round-fail3.saw rename to doc/rust-verification-with-saw/code/salsa20/salsa20-quarter_round-fail3.saw diff --git a/doc/rust-tutorial/code/salsa20/salsa20-quarter_round-fail4.saw b/doc/rust-verification-with-saw/code/salsa20/salsa20-quarter_round-fail4.saw similarity index 100% rename from doc/rust-tutorial/code/salsa20/salsa20-quarter_round-fail4.saw rename to doc/rust-verification-with-saw/code/salsa20/salsa20-quarter_round-fail4.saw diff --git a/doc/rust-tutorial/code/salsa20/salsa20-reference.saw b/doc/rust-verification-with-saw/code/salsa20/salsa20-reference.saw similarity index 100% rename from doc/rust-tutorial/code/salsa20/salsa20-reference.saw rename to doc/rust-verification-with-saw/code/salsa20/salsa20-reference.saw diff --git a/doc/rust-tutorial/code/salsa20/salsa20-rounds-take-1.saw b/doc/rust-verification-with-saw/code/salsa20/salsa20-rounds-take-1.saw similarity index 100% rename from doc/rust-tutorial/code/salsa20/salsa20-rounds-take-1.saw rename to doc/rust-verification-with-saw/code/salsa20/salsa20-rounds-take-1.saw diff --git a/doc/rust-tutorial/code/salsa20/salsa20-rounds-take-2.saw b/doc/rust-verification-with-saw/code/salsa20/salsa20-rounds-take-2.saw similarity index 100% rename from doc/rust-tutorial/code/salsa20/salsa20-rounds-take-2.saw rename to doc/rust-verification-with-saw/code/salsa20/salsa20-rounds-take-2.saw diff --git a/doc/rust-tutorial/code/salsa20/salsa20-rounds-take-3.saw b/doc/rust-verification-with-saw/code/salsa20/salsa20-rounds-take-3.saw similarity index 100% rename from doc/rust-tutorial/code/salsa20/salsa20-rounds-take-3.saw rename to doc/rust-verification-with-saw/code/salsa20/salsa20-rounds-take-3.saw diff --git a/doc/rust-tutorial/code/salsa20/salsa20.linked-mir.json.tar.gz b/doc/rust-verification-with-saw/code/salsa20/salsa20.linked-mir.json.tar.gz similarity index 100% rename from doc/rust-tutorial/code/salsa20/salsa20.linked-mir.json.tar.gz rename to doc/rust-verification-with-saw/code/salsa20/salsa20.linked-mir.json.tar.gz diff --git a/doc/rust-tutorial/code/salsa20/src/core.rs b/doc/rust-verification-with-saw/code/salsa20/src/core.rs similarity index 100% rename from doc/rust-tutorial/code/salsa20/src/core.rs rename to doc/rust-verification-with-saw/code/salsa20/src/core.rs diff --git a/doc/rust-tutorial/code/salsa20/src/lib.rs b/doc/rust-verification-with-saw/code/salsa20/src/lib.rs similarity index 100% rename from doc/rust-tutorial/code/salsa20/src/lib.rs rename to doc/rust-verification-with-saw/code/salsa20/src/lib.rs diff --git a/doc/rust-tutorial/code/salsa20/src/rounds.rs b/doc/rust-verification-with-saw/code/salsa20/src/rounds.rs similarity index 100% rename from doc/rust-tutorial/code/salsa20/src/rounds.rs rename to doc/rust-verification-with-saw/code/salsa20/src/rounds.rs diff --git a/doc/rust-tutorial/code/salsa20/src/salsa.rs b/doc/rust-verification-with-saw/code/salsa20/src/salsa.rs similarity index 100% rename from doc/rust-tutorial/code/salsa20/src/salsa.rs rename to doc/rust-verification-with-saw/code/salsa20/src/salsa.rs diff --git a/doc/rust-tutorial/code/salsa20/src/xsalsa.rs b/doc/rust-verification-with-saw/code/salsa20/src/xsalsa.rs similarity index 100% rename from doc/rust-tutorial/code/salsa20/src/xsalsa.rs rename to doc/rust-verification-with-saw/code/salsa20/src/xsalsa.rs diff --git a/doc/rust-tutorial/code/saw-basics.linked-mir.json b/doc/rust-verification-with-saw/code/saw-basics.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/saw-basics.linked-mir.json rename to doc/rust-verification-with-saw/code/saw-basics.linked-mir.json diff --git a/doc/rust-tutorial/code/saw-basics.rs b/doc/rust-verification-with-saw/code/saw-basics.rs similarity index 100% rename from doc/rust-tutorial/code/saw-basics.rs rename to doc/rust-verification-with-saw/code/saw-basics.rs diff --git a/doc/rust-tutorial/code/saw-basics.saw b/doc/rust-verification-with-saw/code/saw-basics.saw similarity index 100% rename from doc/rust-tutorial/code/saw-basics.saw rename to doc/rust-verification-with-saw/code/saw-basics.saw diff --git a/doc/rust-tutorial/code/slices.linked-mir.json b/doc/rust-verification-with-saw/code/slices.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/slices.linked-mir.json rename to doc/rust-verification-with-saw/code/slices.linked-mir.json diff --git a/doc/rust-tutorial/code/slices.rs b/doc/rust-verification-with-saw/code/slices.rs similarity index 100% rename from doc/rust-tutorial/code/slices.rs rename to doc/rust-verification-with-saw/code/slices.rs diff --git a/doc/rust-tutorial/code/slices.saw b/doc/rust-verification-with-saw/code/slices.saw similarity index 100% rename from doc/rust-tutorial/code/slices.saw rename to doc/rust-verification-with-saw/code/slices.saw diff --git a/doc/rust-tutorial/code/statics-compositional-fail.saw b/doc/rust-verification-with-saw/code/statics-compositional-fail.saw similarity index 100% rename from doc/rust-tutorial/code/statics-compositional-fail.saw rename to doc/rust-verification-with-saw/code/statics-compositional-fail.saw diff --git a/doc/rust-tutorial/code/statics-compositional.linked-mir.json b/doc/rust-verification-with-saw/code/statics-compositional.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/statics-compositional.linked-mir.json rename to doc/rust-verification-with-saw/code/statics-compositional.linked-mir.json diff --git a/doc/rust-tutorial/code/statics-compositional.rs b/doc/rust-verification-with-saw/code/statics-compositional.rs similarity index 100% rename from doc/rust-tutorial/code/statics-compositional.rs rename to doc/rust-verification-with-saw/code/statics-compositional.rs diff --git a/doc/rust-tutorial/code/statics-compositional.saw b/doc/rust-verification-with-saw/code/statics-compositional.saw similarity index 100% rename from doc/rust-tutorial/code/statics-compositional.saw rename to doc/rust-verification-with-saw/code/statics-compositional.saw diff --git a/doc/rust-tutorial/code/statics-fail.saw b/doc/rust-verification-with-saw/code/statics-fail.saw similarity index 100% rename from doc/rust-tutorial/code/statics-fail.saw rename to doc/rust-verification-with-saw/code/statics-fail.saw diff --git a/doc/rust-tutorial/code/statics.linked-mir.json b/doc/rust-verification-with-saw/code/statics.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/statics.linked-mir.json rename to doc/rust-verification-with-saw/code/statics.linked-mir.json diff --git a/doc/rust-tutorial/code/statics.rs b/doc/rust-verification-with-saw/code/statics.rs similarity index 100% rename from doc/rust-tutorial/code/statics.rs rename to doc/rust-verification-with-saw/code/statics.rs diff --git a/doc/rust-tutorial/code/statics.saw b/doc/rust-verification-with-saw/code/statics.saw similarity index 100% rename from doc/rust-tutorial/code/statics.saw rename to doc/rust-verification-with-saw/code/statics.saw diff --git a/doc/rust-tutorial/code/structs.linked-mir.json b/doc/rust-verification-with-saw/code/structs.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/structs.linked-mir.json rename to doc/rust-verification-with-saw/code/structs.linked-mir.json diff --git a/doc/rust-tutorial/code/structs.rs b/doc/rust-verification-with-saw/code/structs.rs similarity index 100% rename from doc/rust-tutorial/code/structs.rs rename to doc/rust-verification-with-saw/code/structs.rs diff --git a/doc/rust-tutorial/code/structs.saw b/doc/rust-verification-with-saw/code/structs.saw similarity index 100% rename from doc/rust-tutorial/code/structs.saw rename to doc/rust-verification-with-saw/code/structs.saw diff --git a/doc/rust-tutorial/code/times-two-ref-fail.saw b/doc/rust-verification-with-saw/code/times-two-ref-fail.saw similarity index 100% rename from doc/rust-tutorial/code/times-two-ref-fail.saw rename to doc/rust-verification-with-saw/code/times-two-ref-fail.saw diff --git a/doc/rust-tutorial/code/times-two.linked-mir.json b/doc/rust-verification-with-saw/code/times-two.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/times-two.linked-mir.json rename to doc/rust-verification-with-saw/code/times-two.linked-mir.json diff --git a/doc/rust-tutorial/code/times-two.rs b/doc/rust-verification-with-saw/code/times-two.rs similarity index 100% rename from doc/rust-tutorial/code/times-two.rs rename to doc/rust-verification-with-saw/code/times-two.rs diff --git a/doc/rust-tutorial/code/times-two.saw b/doc/rust-verification-with-saw/code/times-two.saw similarity index 100% rename from doc/rust-tutorial/code/times-two.saw rename to doc/rust-verification-with-saw/code/times-two.saw diff --git a/doc/rust-tutorial/code/tuples.linked-mir.json b/doc/rust-verification-with-saw/code/tuples.linked-mir.json similarity index 100% rename from doc/rust-tutorial/code/tuples.linked-mir.json rename to doc/rust-verification-with-saw/code/tuples.linked-mir.json diff --git a/doc/rust-tutorial/code/tuples.rs b/doc/rust-verification-with-saw/code/tuples.rs similarity index 100% rename from doc/rust-tutorial/code/tuples.rs rename to doc/rust-verification-with-saw/code/tuples.rs diff --git a/doc/rust-tutorial/code/tuples.saw b/doc/rust-verification-with-saw/code/tuples.saw similarity index 100% rename from doc/rust-tutorial/code/tuples.saw rename to doc/rust-verification-with-saw/code/tuples.saw diff --git a/doc/rust-verification-with-saw/compound-data-types.md b/doc/rust-verification-with-saw/compound-data-types.md new file mode 100644 index 0000000000..54400228f6 --- /dev/null +++ b/doc/rust-verification-with-saw/compound-data-types.md @@ -0,0 +1,532 @@ +# Compound data types + +Besides integer types and reference types, Rust also features a variety of other +interesting data types. This part of the tutorial will briefly go over some of +these data types and how to interface with them in SAW. + +## Array types + +Rust includes array types where the length of the array is known ahead of time. +For instance, this `index` function takes an `arr` argument that must contain +exactly three `u32` values: + +:::{literalinclude} code/arrays.rs +:lines: 1-3 +:language: rust +::: + +While Rust is good at catching many classes of programmer errors at compile +time, one thing that it cannot catch in general is out-of-bounds array +accesses. In this `index` example, calling the function with a value of `idx` +ranging from `0` to `2` is fine, but any other choice of `idx` will cause the +program to crash, since the `idx` will be out of the bounds of `arr`. + +SAW is suited to checking for these sorts of out-of-bound accesses. Let's write +an incorrect spec for `index` to illustrate this: + +:::{literalinclude} code/arrays-fail.saw +:lines: 3-14 +:language: sawscript +::: + +Before we run this with SAW, let's highlight some of the new concepts that this +spec uses: + +1. The type of the `arr` variable is specified using `mir_array 3 mir_u32`. + Here, the `mir_array` function takes the length of the array and the element + type as arguments, just as in Rust. +2. The spec declares the return value to be `{{ arr @ idx }}`, where `@` is + Cryptol's indexing operator. Also note that it is completely valid to embed + a MIR array type into a Cryptol expression, as Cryptol has a sequence type + that acts much like arrays do in MIR. + +As we hinted above, this spec is wrong, as it says that this should work for +_any_ possible values of `idx`. SAW will catch this mistake: + +:::{code-block} console +$ saw arrays-fail.saw + + + +[21:03:05.374] Loading file "arrays-fail.saw" +[21:03:05.411] Verifying arrays/47a26581::index[0] ... +[21:03:05.425] Simulating arrays/47a26581::index[0] ... +[21:03:05.426] Checking proof obligations arrays/47a26581::index[0] ... +[21:03:05.445] Subgoal failed: arrays/47a26581::index[0] index out of bounds: the length is move _4 but the index is _3 +[21:03:05.445] SolverStats {solverStatsSolvers = fromList ["SBV->Z3"], solverStatsGoalSize = 53} +[21:03:05.445] ----------Counterexample---------- +[21:03:05.445] idx: 2147483648 +[21:03:05.445] Stack trace: +"mir_verify" (arrays-fail.saw:14:1-14:11) +Proof failed. +::: + +We can repair this spec by adding some preconditions: + +:::{literalinclude} code/arrays.saw +:lines: 3-12 +:language: sawscript +::: + +An alternative way of writing this spec is by using SAW's `mir_array_value` +command: + +:::{code-block} console +sawscript> :type mir_array_value +MIRType -> [MIRValue] -> MIRValue +::: + +Here, the `MIRType` argument represents the element type, and the list of +`MIRValue` arguments are the element values of the array. We can rewrite +`index_spec` using `mir_array_value` like so: + +:::{literalinclude} code/arrays.saw +:lines: 18-30 +:language: sawscript +::: + +Here, `[arr0, arr1, arr2]` is Cryptol notation for constructing a length-3 +sequence consisting of `arr0`, `arr1`, and `arr2` as the elements. +`index_alt_spec` is equivalent to `index_spec`, albeit more verbose. For this +reason, it is usually preferable to use `mir_fresh_var` to create an entire +symbolic array rather than creating separate symbolic values for each element +and combining them with `mir_array_value`. + +There are some situations where `mir_array_value` is the only viable choice, +however. Consider this variant of the `index` function: + +:::{literalinclude} code/arrays.rs +:lines: 5-7 +:language: rust +::: + +When writing a SAW spec for `index_ref_arr`, we can't just create a symbolic +variable for `arr` using `mir_alloc (mir_array 3 ...)`, as the reference values +in the array wouldn't point to valid memory. Instead, we must individually +allocate the elements of `arr` using separate calls to `mir_alloc` and then +build up the array using `mir_array_value`. (As an exercise, try writing and +verifying a spec for `index_ref_arr`). + +## Tuple types + +Rust includes tuple types where the elements of the tuple can be of different +types. For example: + +:::{literalinclude} code/tuples.rs +:lines: 1-3 +:language: rust +::: + +SAW includes a `mir_tuple` function for specifying the type of a tuple value. +In addition, one can embed MIR tuples into Cryptol, as Cryptol also includes +tuple types whose fields can be indexed with `.0`, `.1`, etc. Here is a spec +for `flip` that makes use of all these features: + +:::{literalinclude} code/tuples.saw +:lines: 3-9 +:language: sawscript +::: + +SAW also includes a `mir_tuple_value` function for constructing a tuple value +from other `MIRValue`s: + +:::{code-block} console +sawscript> :type mir_tuple_value +[MIRValue] -> MIRValue +::: + +`mir_tuple_value` plays a similar role for tuples as `mir_array_value` does for +arrays. + +## Struct types + +Rust supports the ability for users to define custom struct types. Structs are +uniquely identified by their names, so if you have two structs like these: + +:::{literalinclude} code/structs.rs +:lines: 1-2 +:language: rust +::: + +Then even though the fields of the `S` and `T` structs are the same, they are +_not_ the same struct. This is a type system feature that Cryptol does not +have, and for this reason, it is not possible to embed MIR struct values into +Cryptol. It is also not possible to use `mir_fresh_var` to create a symbolic +struct value. Instead, one can use the `mir_struct_value` command: + +:::{code-block} console +sawscript> :type mir_struct_value +MIRAdt -> [MIRValue] -> MIRValue +::: + +Like with `mir_array_value` and `mir_tuple_value`, the `mir_struct_value` +function takes a list of `MIRValue`s as arguments. What makes +`mir_struct_value` unique is its `MIRAdt` argument, which we have not seen up +to this point. In this context, "`Adt`" is shorthand for "[algebraic data +type](https://en.wikipedia.org/wiki/Algebraic_data_type)", and Rust's structs +are an example of ADTs. (Rust also supports enums, another type of ADT that we +will see later in this tutorial.) + +ADTs in Rust are named entities, and as such, they have unique identifiers in +the MIR JSON file in which they are defined. Looking up these identifiers can +be somewhat error-prone, so SAW offers a `mir_find_adt` command that computes +an ADT's identifier and returns the `MIRAdt` associated with it: + +:::{code-block} console +sawscript> :type mir_find_adt +MIRModule -> String -> [MIRType] -> MIRAdt +::: + +Here, `MIRModule` correspond to the MIR JSON file containing the ADT +definition, and the `String` is the name of the ADT whose identifier we want to +look up. The list of `MIRType`s represent types to instantiate any type +parameters to the struct (more on this in a bit). + +As an example, we can look up the `S` and `T` structs from above like so: + +:::{literalinclude} code/structs.saw +:lines: 3-6 +:language: sawscript +::: + +We pass an empty list of `MIRType`s to each use of `mir_find_adt`, as neither +`S` nor `T` have any type parameters. An example of a struct that does include +type parameters can be seen here: + +:::{literalinclude} code/structs.rs +:lines: 12 +:language: rust +::: + +As mentioned before, SAW doesn't support generic definitions out of the box, so +the only way that we can make use of the `Foo` struct is by looking up a +particular instantiation of `Foo`'s type parameters. If we define a function +like this, for example: + +:::{literalinclude} code/structs.rs +:lines: 14-16 +:language: rust +::: + +Then this function instantiates `Foo`'s `A` type parameter with `u32` and the +`B` type parameter with `u64`. We can use `mir_find_adt` to look up this +particular instantiation of `Foo` like so: + +:::{literalinclude} code/structs.saw +:lines: 7 +:language: sawscript +::: + +In general, a MIR JSON file can have many separate instantiations of a single +struct's type parameters, and each instantiation must be looked up separately +using `mir_find_adt`. + +Having looked up `Foo` using `mir_find_adt`, let's use the resulting +`MIRAdt` in a spec: + +:::{literalinclude} code/structs.saw +:lines: 9-18 +:language: sawscript +::: + +Note that we are directly writing out the values `27` and `42` in Cryptol. +Cryptol's numeric literals can take on many different types, so in order to +disambiguate which type they should be, we give each numeric literal an +explicit type annotation. For instance, the expression `27 : [32]` means that +`27` should be a 32-bit integer. + +### Symbolic structs + +Let's now verify a function that takes a struct value as an argument: + +:::{literalinclude} code/structs.rs +:lines: 18-22 +:language: rust +::: + +Moreover, let's verify this function for all possible `Bar` values. One way to +do this is to write a SAW spec that constructs a struct value whose fields are +themselves symbolic: + +:::{literalinclude} code/structs.saw +:lines: 20-38 +:language: sawscript +::: + +This is a rather tedious process, however, as we had to repeatedly use +`mir_fresh_var` to create a fresh, symbolic value for each field. Moreover, +because `mit_fresh_var` does not work for structs, we had to recursively apply +this process in order to create a fresh `Foo` value. It works, but it takes a +lot of typing to accomplish. + +To make this process less tedious, SAW offers a `mir_fresh_expanded_value` +command that allows one to create symbolic values of many more types. While +`mir_fresh_var` is limited to those MIR types that can be directly converted to +Cryptol, `mir_fresh_expanded_value` can create symbolic structs by automating +the process of creating fresh values for each field. This process also applies +recursively for struct fields, such as the `Foo` field in `Bar`. + +As an example, a much shorter way to write the spec above using +`mir_fresh_expanded_value` is: + +:::{literalinclude} code/structs.saw +:lines: 42-48 +:language: sawscript +::: + +That's it! Note that the string `"b"` is used as a prefix for all fresh names +that `mir_fresh_expanded_value` generates, so if SAW produces a counterexample +involving this symbolic struct value, one can expect to see names such as +`b_0`, `b_1`, etc. for the fields of the struct. + +`mir_fresh_expanded_value` makes it easier to construct large, compound values +that consist of many smaller, inner values. The drawback is that you can't +refer to these inner values in the postconditions of a spec. As a result, there +are some functions for which `mir_fresh_expanded_value` isn't suitable, so keep +this in mind before reaching for it. + +## Enum types + +Besides structs, another form of ADT that Rust supports are enums. Each enum +has a number of different _variants_ that describe the different ways that an +enum value can look like. A famous example of a Rust enum is the `Option` type, +which is defined by the standard library like so: + +:::{code-block} rust +enum Option { + None, + Some(T), +} +::: + +`Option` is commonly used in Rust code to represent a value that may be present +(`Some`) or absent (`None`). For this reason, we will use `Option` as our +motivating example of an enum in this section. + +First, let's start by defining some functions that make use of `Option`'s +variants: + +:::{literalinclude} code/enums.rs +:lines: 1-7 +:language: rust +::: + +Both functions return an `Option` value, but each function returns a +different variant. In order to tell these variants apart, we need a SAW +function which can construct an enum value that allows the user to pick which +variant they want to construct. The `mir_enum_value function does exactly that: + +:::{code-block} console +sawscript> :type mir_enum_value +MIRAdt -> String -> [MIRValue] -> MIRValue +::: + +Like `mir_struct_value`, `mir_enum_value` also requires a `MIRAdt` argument in +order to discern which particular enum you want. Unlike `mir_struct_value`, +however, it also requires a `String` which variant of the enum you want. In the +case of `Option`, this `String` will either be `"None"` or `"Some"`. Finally, +the `[MIRValue]` arguments represent the fields of the enum variant. + +Let's now verify some enum-related code with SAW. First, we must look up the +`Option` ADT, which works just as if you had a struct type: + +:::{literalinclude} code/enums.saw +:lines: 5 +:language: sawscript +::: + +Next, we can use this ADT to construct enum values. We shall use +`mir_enum_value` to create a `Some` value in the spec for `i_found_something`: + +:::{literalinclude} code/enums.saw +:lines: 7-16 +:language: sawscript +::: + +Note that while we used the full identifier `core::option::Option` to look up +the `Option` ADT, we do not need to use the `core::option` prefix when +specifying the `"Some"` variant. This is because SAW already knows what the +prefix should be from the `option_u32` ADT, so the `"Some"` shorthand suffices. + +Similarly, we can also write a spec for `i_got_nothing`, which uses the `None` +variant: + +:::{literalinclude} code/enums.saw +:lines: 18-25 +:language: sawscript +::: + +### Symbolic enums + +In order to create a symbolic struct, one could create symbolic fields and pack +them into a larger struct value using `mir_struct_value`. The same process is +not possible with `mir_enum_value`, however, as a symbolic enum value would +need to range over _all_ possible variants in an enum. + +Just as `mir_fresh_expanded_value` supports creating symbolic structs, +`mir_fresh_expanded_value` also supports creating symbolic enum values. For +example, given this function that accepts an `Option` value as an +argument: + +:::{literalinclude} code/enums.rs +:lines: 9-11 +:language: rust +::: + +We can write a spec for this function that considers all possible `Option` +values like so: + +:::{literalinclude} code/enums.saw +:lines: 27-33 +:language: sawscript +::: + +Here, `o` can be a `None` value, or it can be a `Some` value with a symbolic +field. + +## Slices + +Slices are a particular type of reference that allow referencing contiguous +sequences of elements in a collection, such as an array. Unlike ordinary +references (e.g., `&u32`), SAW does not permit allocating a slice directly. +Instead, one must take a slice of an existing reference. To better illustrate +this distinction, consider this function: + +:::{literalinclude} code/slices.rs +:lines: 1-3 +:language: rust +::: + +`sum_of_prefix` takes a slice to a sequence of `u32`s as an argument, indexes +into the first two elements in the sequence, and adds them together. There are +many possible ways we can write a spec for this function, as the slice argument +may be backed by many different sequences. For example, the slice might be +backed by an array whose length is exactly two: + +:::{literalinclude} code/slices.rs +:lines: 6-8 +:language: rust +::: + +We could also make a slice whose length is longer than two: + +:::{literalinclude} code/slices.rs +:lines: 10-12 +:language: rust +::: + +Alternatively, the slice might be a subset of an array whose length is longer +than two: + +:::{literalinclude} code/slices.rs +:lines: 14-16 +:language: rust +::: + +All of these are valid ways of building the slice argument to `sum_of_prefix`. +Let's try to write SAW specifications that construct these different forms of +slices. To do so, we will need SAW functions that take a reference to a +collection (e.g., an array) and converts them into a slice reference. The +`mir_slice_value` function is one such function: + +:::{code-block} console +sawscript> :type mir_slice_value +MIRValue -> MIRValue +::: + +`mir_slice_value arr_ref` is the SAW equivalent of writing `arr_ref[..]`. That +is, if `arr_ref` is of type `&[T; N]`, then `mir_slice_value arr_ref` is of +type `&[T]`. Note that `arr_ref` must be a _reference_ to an array, not an +array itself. + +Let's use `mir_slice_value` to write a spec for `sum_of_prefix` when the slice +argument is backed by an array of length two: + +:::{literalinclude} code/slices.saw +:lines: 5-15 +:language: sawscript +::: + +The first part of this spec allocates an array reference `a_ref` and declares +that it points to a fresh array value `a_val`. The next part declares a slice +`s` that is backed by the entirety of `a_ref`, which is then passed as an +argument to the function itself. Finally, the return value is declared to be +the sum of the first and second elements of `a_val`, which are the same values +that back the slice `s` itself. + +As noted above, the `sum_of_prefix` function can work with slices of many +different lengths. Here is a slight modification to this spec that declares it +to take a slice of length 5 rather than a slice of length 2: + +:::{literalinclude} code/slices.saw +:lines: 19-29 +:language: sawscript +::: + +Both of these examples declare a slice whose length matches the length of the +underlying array. In general, there is no reason that these have to be the +same, and it is perfectly fine for a slice's length to be less than the the +length of the underlying array. In Rust, for example, we can write a slice of a +subset of an array by writing `&arr_ref[0..2]`. The SAW equivalent of this can +be achieved with the `mir_slice_range_value` function: + +:::{code-block} console +sawscript> :type mir_slice_range_value +MIRValue -> Int -> Int -> MIRValue +::: + +`mir_slice_range_value` takes takes two additional `Int` arguments that +represent (1) the index to start the slice from, and (2) the index at which the +slice ends. For example, `mir_slice_range_value arr_ref 0 2` creates a slice +that is backed by the first element (index `0`) and the second element (index +`1`) of `arr_ref`. Note that the range `[0..2]` is half-open, so this range +does _not_ include the third element (index `2`). + +For example, here is how to write a spec for `sum_of_prefix` where the slice is +a length-2 subset of the original array: + +:::{literalinclude} code/slices.saw +:lines: 33-43 +:language: sawscript +::: + +Note that both `Int` arguments to `mir_slice_range_value` must be concrete +(i.e., not symbolic). (See the section below if you want an explanation for why +they are not allowed to be symbolic.) + +### Aside: slices of arbitrary length + +After reading the section about slices above, one might reasonably wonder: is +there a way to write a more general spec for `sum_of_prefix`: that covers all +possible slice lengths `n`, where `n` is greater than or equal to 2? In this +case, the answer is "no". + +This is a fundamental limitation of the way SAW's symbolic execution works. The +full reason for why this is the case is somewhat technical (keep reading if you +want to learn more), but the short answer is that if SAW attempts to +simulate code whose length is bounded by a symbolic integer, then SAW will go +into an infinite loop. To avoid this pitfall, the `mir_slice_range_value` +function very deliberately requires the start and end values to be concrete +integers, as allowing these values to be symbolic would allow users to +inadvertently introduce infinite loops in their specifications. + +A longer answer as to why SAW loops forever on computations that are bounded by +symbolic lengths: due to the way SAW's symblolic execution works, it creates a +complete model of the behavior of a function for all possible inputs. The way +that SAW achieves this is by exploring all possible execution paths through a +program. If a program involves a loop, for example, then SAW will unroll all +iterations of the loop to construct a model of the loop's behavior. Similarly, +if a sequence (e.g., a slice or array) has an unspecified length, then SAW must +consider all possible lengths of the array. + +SAW's ability to completely characterize the behavior of all paths through a +function is one of its strengths, as this allows it to prove theorems that +other program verification techniques would not. This strength is also a +weakness, however. If a loop has a symbolic number of iterations, for example, +then SAW will spin forever trying to unroll the loop. Similarly, if a slice +were to have a symbolic length, then SAW would spin forever trying to simulate +the program for all possible slice lengths. + +In general, SAW cannot prevent users from writing programs whose length is +bounded by a symbolic value. For now, however, SAW removes one potential +footgun by requiring that slice values always have a concrete length. diff --git a/doc/rust-verification-with-saw/index.md b/doc/rust-verification-with-saw/index.md new file mode 100644 index 0000000000..7c6e62c845 --- /dev/null +++ b/doc/rust-verification-with-saw/index.md @@ -0,0 +1,33 @@ +# Rust Verification with SAW + +This tutorial introduces SAW's Rust/MIR verification capabilities. +After completing this tutorial, you will be able to: + +- Use `saw-rustc` and `cargo-saw-build` to extract SAW-ready MIR from Rust +- Use SAW's MIR capabilities to verify code that utilizes references and + compound data types (i.e. `struct`s and `enum`s) +- Write proofs _compositionally_, even when the subtleties of mutable references + are involved +- Verify code that utilizes _static items_ + +Additionally, you will step through the process of verifying an implementation +of the Salsa20 stream cipher against a Cryptol reference specification, showing +how SAW's MIR capabilities may be used on a more realistic codebase. + +You can download {download-dir}`the tutorial files `. + +```{toctree} +:maxdepth: 2 +:caption: Contents + +introduction +prerequisites +about-mir-json +saw-basics +reference-types +compound-data-types +overrides-and-compositional-verification +static-items +case-study-salsa20 +a-final-word +``` diff --git a/doc/rust-verification-with-saw/introduction.md b/doc/rust-verification-with-saw/introduction.md new file mode 100644 index 0000000000..8b2976afd9 --- /dev/null +++ b/doc/rust-verification-with-saw/introduction.md @@ -0,0 +1,24 @@ +# Introduction + +SAW is a special-purpose programming environment developed by Galois to help +orchestrate and track the results of the large collection of proof tools +necessary for analysis and verification of complex software artifacts. + +SAW adopts the functional paradigm, and largely follows the structure of many +other typed functional languages, with some special features specifically +targeted at the coordination of verification and analysis tasks. + +This tutorial introduces the details of the language by walking through several +examples, and showing how simple verification tasks can be described. The +complete examples are available [on +GitHub](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code). +Most of the examples make use of inline specifications written in Cryptol, a +language originally designed for high-level descriptions of cryptographic +algorithms. For readers unfamiliar with Cryptol, various documents describing +its use are available [here](http://cryptol.net/documentation.html). + +This tutorial also include a [case study](./case-study-salsa20.md) on how to use +SAW to verify a real-world implementation of the Salsa20 stream cipher based on +the [`stream-ciphers`](https://github.com/RustCrypto/stream-ciphers) Rust +project. The code for this case study is also available [on +GitHub](https://github.com/GaloisInc/saw-script/tree/master/doc/rust-tutorial/code/salsa20). diff --git a/doc/rust-verification-with-saw/overrides-and-compositional-verification.md b/doc/rust-verification-with-saw/overrides-and-compositional-verification.md new file mode 100644 index 0000000000..4d2e998e5d --- /dev/null +++ b/doc/rust-verification-with-saw/overrides-and-compositional-verification.md @@ -0,0 +1,367 @@ +# Overrides and compositional verification + +Up until this point, all uses of `mir_verify` in this tutorial have provided an +empty list (`[]`) of overrides. This means that any time SAW has simulated a +function which calls another function, it will step into the definition of the +callee function and verify its behavior alongside the behavior of the callee +function. This is a fine thing to do, but it can be inefficient. For example, +consider a function like this: + +:::{literalinclude} code/overrides.rs +:lines: 5-9 +:language: rust +::: + +Here, the caller function `f` invokes the callee function `g` three separate +times. If we verify `f` with `mir_verify` as we have done up until this point, +then SAW must analyze the behavior of `g` three separate times. This is +wasteful, and especially so if `g` is a large and complicated function. + +This is where _compositional verification_ enters the picture. The idea behind +compositional verification is that when we prove properties of a caller +function, we can reuse properties that we have already proved about callee +functions. These properties are captured as _override specifications_, which +are also referred to by the shorthand term _overrides_. When a caller invokes a +callee with a corresponding override specification, the override's properties +are applied without needing to re-simulate the entire function. + +As it turns out, the command needed to produce an override specification is +already familiar to us—it's `mir_verify`! If you examine the type of this +command: + +:::{code-block} console +sawscript> :type mir_verify +MIRModule -> String -> [MIRSpec] -> Bool -> MIRSetup () -> ProofScript () -> TopLevel MIRSpec +::: + +The returned value is a `MIRSpec`, which captures the behavior of the function +that was verified as an override spec. This override can then be passed to +another call to `mir_verify` to use as part of a larger proof. + +Let's now try compositional verification in practice. To do so, we will first +prove a spec for the `g` function above. For demonstration purposes, we will +pick a simplistic implementation of `g`: + +:::{literalinclude} code/overrides.rs +:lines: 1-3 +:language: rust +::: + +Note that we don't really _have_ to use compositional verification when `g` is +this simple, as SAW is capable of reasoning about `g`'s behavior directly when +proving a spec for `f`. It's still worth going along with this exercise, +however, as the same principles of compositional verification apply whether the +implementation of `g` is small or large. + +The first step of compositional verification is to prove a spec for `g`, the +callee function: + +:::{literalinclude} code/overrides.saw +:lines: 5-13 +:language: sawscript +::: + +There's nothing that different about this particular proof from the proofs +we've seen before. The only notable difference is that we bind the result of +calling `mir_verify` to a `MIRSpec` value that we name `g_ov` (short for "`g` +override"). This part is important, as we will need to use `g_ov` shortly. + +The next step is to write a spec for `f`. Since `g` adds `1` to its argument, +`f` will add `3` to its argument: + +:::{literalinclude} code/overrides.saw +:lines: 15-21 +:language: sawscript +::: + +Again, nothing too surprising. Now let's prove `f` against `f_spec` by using +`g_ov` as a compositional override: + +:::{literalinclude} code/overrides.saw +:lines: 23 +:language: sawscript +::: + +Here, note that instead of passing an empty list (`[]`) as we have done before, +we now pass a list containing `g_ov`. This informs `mir_verify` that whenever +it simulates a call to `g`, it should reuse the properties captured in `g_ov`. +In general, we can pass as many overrides as we want (we will see examples of +this later in the tutorial), but for now, one override will suffice. + +Let's run the proof of `f` against `f_spec`, making sure to pay attention to +the output of SAW: + +:::{code-block} console +[19:06:17.392] Verifying overrides/96c5af24::f[0] ... +[19:06:17.406] Simulating overrides/96c5af24::f[0] ... +[19:06:17.407] Matching 1 overrides of overrides/96c5af24::g[0] ... +[19:06:17.407] Branching on 1 override variants of overrides/96c5af24::g[0] ... +[19:06:17.407] Applied override! overrides/96c5af24::g[0] +[19:06:17.407] Matching 1 overrides of overrides/96c5af24::g[0] ... +[19:06:17.407] Branching on 1 override variants of overrides/96c5af24::g[0] ... +[19:06:17.407] Applied override! overrides/96c5af24::g[0] +[19:06:17.407] Matching 1 overrides of overrides/96c5af24::g[0] ... +[19:06:17.407] Branching on 1 override variants of overrides/96c5af24::g[0] ... +[19:06:17.407] Applied override! overrides/96c5af24::g[0] +[19:06:17.407] Checking proof obligations overrides/96c5af24::f[0] ... +[19:06:17.422] Proof succeeded! overrides/96c5af24::f[0] +::: + +We've now proven `f` compositionally! The first two lines ("`Verifying ...`" +and "`Simulating ...`") and the last two lines ("`Checking proof obligations +...`" and "`Proof succeeded! ...`") are the same as before, but this time, we +have some additional lines of output in between: + +* Whenever SAW prints "`Matching overrides of `", that's when you + know that SAW is about to simulate a call to ``. At that point, SAW + will check to see how many overrides (``) for `` are available. +* Whenever SAW prints "`Branching on override variants of `", SAW is + trying to figure out which of the `` overrides to apply. In this example, + there is only a single override, so the choice is easy. In cases where there + are multiple overrides, however, SAW may have to work harder (possibly even + consulting an SMT solver) to figure out which override to use. +* If SAW successfully picks an override to apply, it will print + "`Applied override! ...`". + +In the example above, we used a single `g` override that applies for all +possible arguments. In general, however, there is no requirement that overrides +must work for all arguments. In fact, it is quite common for SAW verification +efforts to write different specifications for the same function, but with +different arguments. We can then provide multiple overrides for the same +function as part of a compositional verification, and SAW will be able to pick +the right override depending on the shape of the argument when invoking the +function being overridden. + +For example, let's suppose that we wrote different `g` specs, one where the +argument to `g` is even, and another where the argument to `g` is odd: + +:::{literalinclude} code/overrides.saw +:lines: 25-44 +:language: sawscript +::: + +We can then prove `f` compositionally by passing both of the `g` overrides to +`mir_verify`: + +:::{literalinclude} code/overrides.saw +:lines: 45 +:language: sawscript +::: + +Like before, this will successfully verify. The only different now is that SAW +will print output involving two overrides instead of just one: + +:::{code-block} console +[20:48:07.649] Simulating overrides/96c5af24::f[0] ... +[20:48:07.650] Matching 2 overrides of overrides/96c5af24::g[0] ... +[20:48:07.650] Branching on 2 override variants of overrides/96c5af24::g[0] ... +[20:48:07.652] Applied override! overrides/96c5af24::g[0] +... +::: + +Keep in mind that if you provide at least one override for a function as part +of a compositional verification, then SAW _must_ apply an override whenever it +invokes that function during simulation. If SAW cannot find a matching +override, then the verification will fail. For instance, consider what would +happen if you tried proving `f` like so: + +:::{literalinclude} code/overrides-fail.saw +:lines: 33 +:language: sawscript +::: + +This time, we supply one override for `g` that only matches when the argument +is even. This is a problem, as SAW will not be able to find a matching override +when the argument is odd. Indeed, SAW will fail to verify this: + +:::{code-block} console +[20:53:29.588] Verifying overrides/96c5af24::f[0] ... +[20:53:29.602] Simulating overrides/96c5af24::f[0] ... +[20:53:29.602] Matching 1 overrides of overrides/96c5af24::g[0] ... +[20:53:29.602] Branching on 1 override variants of overrides/96c5af24::g[0] ... +[20:53:29.603] Applied override! overrides/96c5af24::g[0] +[20:53:29.603] Matching 1 overrides of overrides/96c5af24::g[0] ... +[20:53:29.603] Branching on 1 override variants of overrides/96c5af24::g[0] ... +[20:53:29.604] Applied override! overrides/96c5af24::g[0] +[20:53:29.604] Matching 1 overrides of overrides/96c5af24::g[0] ... +[20:53:29.604] Branching on 1 override variants of overrides/96c5af24::g[0] ... +[20:53:29.605] Applied override! overrides/96c5af24::g[0] +[20:53:29.605] Symbolic simulation completed with side conditions. +[20:53:29.606] Checking proof obligations overrides/96c5af24::f[0] ... +[20:53:29.623] Subgoal failed: overrides/96c5af24::f[0] No override specification applies for overrides/96c5af24::g[0]. +Arguments: +- c@26:bv +Run SAW with --sim-verbose=3 to see a description of each override. +[20:53:29.623] SolverStats {solverStatsSolvers = fromList ["SBV->Z3"], solverStatsGoalSize = 388} +[20:53:29.624] ----------Counterexample---------- +[20:53:29.624] x: 1 +... +Proof failed. +::: + +Here, we can see that `No override specification applies`, and SAW also +generates a counterexample of `x: 1`. Sure enough, `1` is an odd number! + +## Overrides and mutable references + +Compositional overrides provide great power, as they effectively allow you to +skip over certain functions when simulating them and replace them with simpler +implementations. With great power comes great responsibility, however. In +particular, one must be careful when using overrides for functions that modify +mutable references. If an override does not properly capture the behavior of a +mutable reference, it could potentially lead to incorrect proofs. + +This is the sort of thing that is best explained with an example, so consider +these two functions: + +:::{literalinclude} code/overrides-mut.rs +:lines: 1-9 +:language: rust +::: + +The `side_effect` function does not return anything interesting; it is only +ever invoked to perform a side effect of changing the mutable reference `a` to +point to `0`. The `foo` function invokes `side_effect`, and as a result, it +will always return `0`, regardless of what the argument to `foo` is. No +surprises just yet. + +Now let's make a first attempt at verifying `foo` using compositional +verification. First, we will write a spec for `side_effect`: + +:::{literalinclude} code/overrides-mut-fail.saw +:lines: 5-10 +:language: sawscript +::: + +`side_effect_spec` is somewhat odd. Although it goes through the effort of +allocating a mutable reference `a_ref` and initializing it, nothing about this +spec states that `a_ref` will point to `0` after the function has been invoked. +This omission is strange, but not outright wrong—the spec just underspecifies +what the behavior of the function is. Indeed, SAW will successfully verify this +spec using `mir_verify`: + +:::{literalinclude} code/overrides-mut-fail.saw +:lines: 18 +:language: sawscript +::: + +Next, let's try to write a spec for `foo`: + +:::{literalinclude} code/overrides-mut-fail.saw +:lines: 12-16 +:language: sawscript +::: + +At this point, alarm bells should be going off in your head. This spec +incorrectly states that `foo(x)` should return `x`, but it should actually +return `0`! This looks wrong, but consider what would happen if you tried to +verify this compositionally using our `side_effect_ov` override: + +:::{literalinclude} code/overrides-mut-fail.saw +:lines: 19 +:language: sawscript +::: + +If SAW were to simulate `foo(x)`, it would invoke create a temporary variable +`b` and assign it to the value `x`, and then it would invoke `side_effect(&mut +b)`. At this point, the `side_effect_ov` override would apply. According to +`side_effect_spec`, the argument to `side_effect` is not modified at all after +the function returns. This means that when the `foo` function returns `b`, it +will still retain its initial value of `x`. This shows that if we were to use +`side_effect_ov`, we could prove something that's blatantly false! + +Now that we've made you sweat a little bit, it's time for some good news: SAW +won't _actually_ let you prove `foo_spec`. If you try this compositional proof +in practice, SAW will catch your mistake: + +:::{code-block} console +[14:50:29.170] Verifying overrides_mut/11e47708::foo[0] ... +[14:50:29.181] Simulating overrides_mut/11e47708::foo[0] ... +[14:50:29.181] Matching 1 overrides of overrides_mut/11e47708::side_effect[0] ... +[14:50:29.181] Branching on 1 override variants of overrides_mut/11e47708::side_effect[0] ... +... +State of memory allocated in precondition (at overrides-mut-fail.saw:6:12) not described in postcondition +::: + +The line of code that SAW points to in the "`State of memory ...`" error +message is: + +:::{literalinclude} code/overrides-mut-fail.saw +:lines: 6 +:language: sawscript +::: + +SAW informs us that although we allocated the mutable reference `a_ref`, we +never indicated what it should point to after the function has returned. This +is an acceptable (if somewhat unusual) thing to do when verifying +`side_effect_spec` using `mir_verify`, but it is _not_ acceptable to do this +when using this spec as an override. To avoid unsound behavior like what is +described above, any override that allocates a mutable reference in its +preconditions _must_ declare what its value should be in the postconditions, no +exceptions. + +Thankfully, repairing this spec is relatively straightforward. Simply add a +`mir_points_to` statement in the postconditions of `side_effect_spec`: + +:::{literalinclude} code/overrides-mut.saw +:lines: 5-13 +:language: sawscript +::: + +Then use the correct return value in `foo_spec`: + +:::{literalinclude} code/overrides-mut.saw +:lines: 15-21 +:language: sawscript +::: + +And now the compositional proof of `foo_spec` works! + +## Unsafe overrides + +Now that we've made it this far into the tutorial, it's time to teach you a +more advanced technique: _unsafe_ overrides. Up until this point, we have +relied on SAW to check all of our work, and this is usually what you'd want +from a formal verification tool. In certain circumstances, however, it can be +useful to say "I know what I'm doing, SAW—just believe me when I say this spec +is valid!" In order to say this, you can use `mir_unsafe_assume_spec`: + +:::{code-block} console +sawscript> :type mir_unsafe_assume_spec +MIRModule -> String -> MIRSetup () -> TopLevel MIRSpec +::: + +`mir_unsafe_assume_spec` is `mir_verify`'s cousin who likes to live a little +more dangerously. Unlike `mir_verify`, the specification that you pass to +`mir_unsafe_assume_spec` (the `MIRSetup ()` argument) is _not_ checked for full +correctness. That is, `mir_unsafe_assume_spec` will bypass SAW's usual symbolic +execution pipeline, which is why one does not need to pass a `ProofScript` +argument (e.g., `z3`) to `mir_unsafe_assume_spec`. SAW will believe whatever +spec you supply `mir_unsafe_assume_spec` to be valid, and the `MIRSpec` that +`mir_unsafe_assume_spec` returns can then be used in later compositional +verifications. + +Why would you want to do this? The main reason is that writing proofs can be +difficult, and sometimes, there are certain functions in a SAW verification +effort that are disproportionately harder to write a spec for than others. It +is tempting to write specs for each function in sequence, but this can run the +risk of getting stuck on a particularly hard-to-verify function, blocking +progress on other parts of the proofs. + +In these situations, `mir_unsafe_assume_spec` can be a useful prototyping tool. +One can use `mir_unsafe_assume_spec` to assume a spec for the hard-to-verify +function and then proceed with the remaining parts of the proof. Of course, you +should make an effort to go back and prove the hard-to-verify function's spec +later, but it can be nice to try something else first. + +For example, here is how one can unsafely assume `g_spec` and use it in a +compositional proof of `f_spec`: + +:::{literalinclude} code/overrides-unsafe.saw +:lines: 21-22 +:language: sawscript +::: + +It should be emphasized that when we say "`unsafe`", we really mean it. +`mir_unsafe_assume_spec` can be used to prove specs that are blatantly wrong, +so use it with caution. diff --git a/doc/rust-verification-with-saw/prerequisites.md b/doc/rust-verification-with-saw/prerequisites.md new file mode 100644 index 0000000000..4704494eee --- /dev/null +++ b/doc/rust-verification-with-saw/prerequisites.md @@ -0,0 +1,12 @@ +# Prerequisites + +In order to run the examples in this tutorial, you will need to install the +following prerequisite tools: + +* SAW itself, which can be installed by following the instructions + [here](https://github.com/GaloisInc/saw-script#manual-installation). +* The Z3 and Yices SMT solvers. Z3 can be downloaded from + [here](https://github.com/Z3Prover/z3/releases), and Yices can be downloaded + from [here](https://yices.csl.sri.com/). +* The `mir-json` tool, which can be installed by following the instructions + [here](https://github.com/GaloisInc/mir-json#installation-instructions). diff --git a/doc/rust-verification-with-saw/reference-types.md b/doc/rust-verification-with-saw/reference-types.md new file mode 100644 index 0000000000..fd2b0e5a21 --- /dev/null +++ b/doc/rust-verification-with-saw/reference-types.md @@ -0,0 +1,127 @@ +# Reference types + +All of the examples we have seen up to this point involve simple integer types +such as `u8` and `u32`. While these are useful, Rust's type system features +much more than just integers. A key part of Rust's type system are its +reference types. For example, in this `read_ref` function: + +:::{literalinclude} code/ref-basics.rs +:lines: 1-3 +:language: rust +::: + +The function reads the value that `r` (of type `&u32`) points to and returns +it. Writing SAW specifications involving references is somewhat trickier than +with other types of values because we must also specify what memory the +reference points to. SAW provides a special command for doing this called +`mir_alloc`: + +:::{code-block} console +sawscript> :type mir_alloc +MIRType -> MIRSetup MIRValue +::: + +`mir_alloc` will allocate a reference value with enough space to hold a value +of the given `MIRType`. Unlike `mir_fresh_var`, `mir_alloc` returns a +`MIRValue` instead of a `Term`. We mentioned before that `Term`s are only a +subset of `MIRValue`s, and this is one of the reasons why. Cryptol does not +have a notion of reference values, but `MIRValue`s do. As a result, you cannot +embed the result of a call to `mir_alloc` in a Cryptol expression. + +`mir_alloc` must be used with some care. Here is a first, not-quite-correct +attempt at writing a spec for `read_ref` using `mir_alloc`: + +:::{literalinclude} code/ref-basics-fail.saw +:lines: 3-11 +:language: sawscript +::: + +As the comment suggests, it's not entirely clear what this spec should return. +We can't return `r`, since `read_ref` returns something of type `u32`, not +`&u32`. On the other hand, we don't have any values of type `u32` lying around +that are obviously the right thing to use here. Nevertheless, it's not required +for a SAW spec to include a `mir_return` statement, so let's see what happens +if we verify this as-is: + +:::{code-block} console +$ saw ref-basics-fail.saw + + + +[20:13:27.224] Loading file "ref-basics-fail.saw" +[20:13:27.227] Verifying ref_basics/54ae7b63::read_ref[0] ... +[20:13:27.235] Simulating ref_basics/54ae7b63::read_ref[0] ... +[20:13:27.235] Stack trace: +"mir_verify" (ref-basics-fail.saw:11:1-11:11) +Symbolic execution failed. +Abort due to assertion failure: + ref-basics.rs:2:5: 2:7: error: in ref_basics/54ae7b63::read_ref[0] + attempted to read empty mux tree +::: + +Clearly, SAW didn't like what we gave it. The reason this happens is although +we allocated memory for the reference `r`, we never told SAW what value should +live in that memory. When SAW simulated the `read_ref` function, it attempted +to dereference `r`, which pointed to uninitialized memory. This is constitutes +an error in SAW, which is what this "`attempted to read empty mux tree`" +business is about. + +SAW provides a `mir_points_to` command to declare what value a reference should +point to: + +:::{code-block} console +sawscript> :type mir_points_to +MIRValue -> MIRValue -> MIRSetup () +::: + +Here, the first `MIRValue` argument represents a reference value, and the +second `MIRValue` argument represents the value that the reference should point +to. In our spec for `read_ref`, we can declare that the reference should point +to a symbolic `u32` value like so: + +:::{literalinclude} code/ref-basics.saw +:lines: 3-9 +:language: sawscript +::: + +We have renamed `r` to `r_ref` in this revised spec to more easily distinguish +it from `r_val`, which is the value that `r_ref` is declared to point to using +`mir_points_to`. In this version of the spec, it is clear that we should return +`r_val` using `mir_return`, as `r_val` is exactly the value that will be +computed by dereferencing `r_ref`. + +This pattern, where a call to `mir_alloc`/`mir_alloc_mut` to followed by a call +to `mir_points_to`, is common with function specs that involve references. +Later in the tutorial, we will see other examples of `mir_points_to` where the +reference argument does not come from `mir_alloc`/`mir_alloc_mut`. + +The argument to `read_ref` is an immutable reference, so the implementation of +the function is not allowed to modify the memory that the argument points to. +Rust also features mutable references that do permit modifying the underlying +memory, as seen in this `swap` function: + +:::{literalinclude} code/ref-basics.rs +:lines: 5-11 +:language: rust +::: + +A corresponding spec for `swap` is: + +:::{literalinclude} code/ref-basics.saw +:lines: 15-28 +:language: sawscript +::: + +There are two interesting things worth calling out in this spec: + +1. Instead of allocating the reference values with `mir_alloc`, we instead use + `mir_alloc_mut`. This is a consequence of the fact that `&mut u32` is a + different type from `&mut` in Rust (and in MIR), and and such, we need a + separate `mir_alloc_mut` to get the types right. +2. This spec features calls to `mir_points_to` before _and_ after + `mir_execute_func`. This is because the values that `a_ref` and `b_ref` point + to before calling the function are different than the values that they point + to after calling the function. The two uses to `mir_points_to` after the + function has been called swap the order of `a_val` and `b_val`, reflecting + the fact that the `swap` function itself swaps the values that the references + point to. diff --git a/doc/rust-verification-with-saw/saw-basics.md b/doc/rust-verification-with-saw/saw-basics.md new file mode 100644 index 0000000000..69ce374c55 --- /dev/null +++ b/doc/rust-verification-with-saw/saw-basics.md @@ -0,0 +1,393 @@ +# SAW basics + +## A first SAW example + +We now have the knowledge necessary to compile Rust code in a way that is +suitable for SAW. Let's put our skills to the test and verify something! We will +build on the example from above, which we will put into a file named +`saw-basics.rs`: + +:::{literalinclude} code/saw-basics.rs +:language: rust +::: + +Our goal is to verify the correctness of the `id_u8` function. However, it is +meaningless to talk about whether a function is correct without having a +_specification_ for how the function should behave. This is where SAW enters +the picture. SAW provides a scripting language named _SAWScript_ that allows +you to write a precise specification for describing a function's behavior. For +example, here is a specification that captures the intended behavior of +`id_u8`: + +:::{literalinclude} code/saw-basics.saw +:lines: 3-7 +:language: sawscript +::: + +At a high level, this specification says that `id_u8` is a function that accepts +a single argument of type `u8`, and it returns its argument unchanged. Nothing +too surprising there, but this example illustrates many of the concepts that one +must use when working with SAW. Let's unpack what this is doing, line by line: + +* In SAWScript, specifications are ordinary values that are defined with `let`. + In this example, we are defining a specification named `id_u8_spec`. +* Specifications are defined using "`do`-notation". That is, they are assembled + by writing `do { ; ; ...; ; }`, where each `` is a + statement that declares some property about the function being verified. A + statement can optionally bind a variable that can be passed to later + statements, which is accomplished by writing ` <- `. +* The `x <- mir_fresh_var "x" mir_u8;` line declares that `x` is a fresh + variable of type `u8` (represented by `mir_u8` in SAWScript) that has some + unspecified value. In SAW parlance, we refer to these unspecified values as + _symbolic_ values. SAW uses an SMT solver under the hood to reason about + symbolic values. + + The `"x"` string indicates what name the variable `x` should have when sent + to the underlying SMT solver. This is primarily meant as a debugging aid, and + it is not required that the string match the name of the SAWScript variable. + (For instance, you could just as well have passed `"x_smt"` or something + else.) +* The `mir_execute_func [mir_term x];` line declares that the function should + be invoked with `x` as the argument. For technical reasons, we pass + `mir_term x` to `mir_execute_func` rather than just `x`; we will go over what + `mir_term` does later in the tutorial. +* Finally, the `mir_return (mir_term x);` line declares that the function should + return `x` once it has finished. + +Now that we have a specification in hand, it's time to prove that `id_u8` +actually adheres to the spec. To do so, we need to load the MIR JSON version of +`id_u8` into SAW, which is done with the `mir_load_module` command: + +:::{literalinclude} code/saw-basics.saw +:lines: 9 +:language: sawscript +::: + +This `m` variable contains the definition of `id_u8`, as well as the other code +defined in the program. We can then pass `m` to the `mir_verify` command, which +actually verifies that `id_u8` behaves according to `id_u8_spec`: + +:::{literalinclude} code/saw-basics.saw +:lines: 11 +:language: sawscript +::: + +Here is what is going on in this command: + +* The `m` and `"saw_basics::id_u8"` arguments instruct SAW to verify the + `id_u8` function located in the `saw_basics` crate defined in `m`. Note that + we are using the shorthand identifier notation here, so we are allowed to omit + the disambiguator for the `saw_basics` crate. +* The `[]` argument indicates that we will not provide any function overrides to + use when SAW simulates the `id_u8` function. (We will go over how overrides + work later in the tutorial.) +* The `false` argument indicates that SAW should not use path satisfiability + checking when analyzing the function. Path satisfiability checking is an + advanced SAW feature that we will not be making use of in this tutorial, so we + will always use `false` here. +* The `id_u8_spec` argument indicates that `id_u8` should be checked against the + specification defined by `id_u8_spec`. +* The `z3` argument indicates that SAW should use the Z3 SMT solver to solve + any proof goals that are generated during verification. SAW also supports + other SMT solvers, although we will mostly use Z3 in this tutorial. + +Putting this all together, our complete `saw-basics.saw` file is: + +:::{literalinclude} code/saw-basics.saw +:language: sawscript +::: + +One minor detail that we left out until just now is that the SAW's interface to +MIR is still experimental, so you must explicitly opt into it with the +`enable_experimental` command. + +Now that everything is in place, we can check this proof like so: + +:::{code-block} console +$ saw saw-basics.saw + + + +[16:14:07.006] Loading file "saw-basics.saw" +[16:14:07.009] Verifying saw_basics/f77ebf43::id_u8[0] ... +[16:14:07.017] Simulating saw_basics/f77ebf43::id_u8[0] ... +[16:14:07.017] Checking proof obligations saw_basics/f77ebf43::id_u8[0] ... +[16:14:07.017] Proof succeeded! saw_basics/f77ebf43::id_u8[0] +::: + +Tada! SAW was successfully able to prove that `id_u8` adheres to its spec. + +## Cryptol + +The spec in the previous section is nice and simple. It's also not very +interesting, as it's fairly obvious at a glance that `id_u8`'s implementation +is correct. Most of the time, we want to verify more complicated functions +where the correspondence between the specification and the implementation is +not always so clear. + +For example, consider this function, which multiplies a number by two: + +:::{literalinclude} code/times-two.rs +:lines: 1-3 +:language: rust +::: + +The straightforward way to implement this function would be to return `2 * x`, +but the author of this function _really_ cared about performance. As such, the +author applied a micro-optimization that computes the multiplication with a +single left-shift (`<<`). This is the sort of scenario where we are pretty sure +that the optimized version of the code is equivalent to the original version, +but it would be nice for SAW to check this. + +Let's write a specification for the `times_two` function: + +:::{literalinclude} code/times-two.saw +:lines: 3-7 +:language: sawscript +::: + +This spec introduces code delimited by double curly braces `{{ ... }}`, which +is a piece of syntax that we haven't seen before. The code in between the curly +braces is written in [Cryptol](http://cryptol.net/documentation.html), a +language designed for writing high-level specifications of various algorithms. +Cryptol supports most arithmetic operations, so `2 * x` works exactly as you +would expect. Also note that the `x` variable was originally bound in the +SAWScript language, but it is possible to embed `x` into the Cryptol language +by referencing `x` within the curly braces. (We'll say more about how this +embedding works later.) + +`{{ 2 * x }}` takes the Cryptol expression `2 * x` and lifts it to a SAW +expression. As such, this SAW spec declares that the function takes a single +`u32`-typed argument `x` and returns `2 * x`. We could have also wrote the +specification to declare that the function returns `x << 1`, but that would +have defeated the point of this exercise: we specifically want to check that +the function against a spec that is as simple and readable as possible. + +Our full SAW file is: + +:::{literalinclude} code/times-two.saw +:lines: 1-11 +:language: sawscript +::: + +Which we can verify is correct like so: + +:::{code-block} console +$ saw times-two.saw + + + +[17:51:35.469] Loading file "times-two.saw" +[17:51:35.497] Verifying times_two/6f4e41af::times_two[0] ... +[17:51:35.512] Simulating times_two/6f4e41af::times_two[0] ... +[17:51:35.513] Checking proof obligations times_two/6f4e41af::times_two[0] ... +[17:51:35.527] Proof succeeded! times_two/6f4e41af::times_two[0] +::: + +Nice! Even though the `times_two` function does not literally return `2 * x`, +SAW is able to confirm that the function behaves as if it were implemented that +way. + +## `Term`s and other types + +Now that we know how Cryptol can be used within SAW, we can go back and explain +what the `mir_term` function does. It is helpful to examine the type of +`mir_term` by using SAW's interactive mode. To do so, run the `saw` binary +without any other arguments: + +:::{code-block} console +$ saw +::: + +Then run `enable_experimental` (to enable MIR-related commands) and run `:type +mir_term`: + +:::{code-block} console +sawscript> enable_experimental +sawscript> :type mir_term +Term -> MIRValue +::: + +Here, we see that `mir_term` accepts a `Term` as an argument and returns a +`MIRValue`. In this context, the `Term` type represents a Cryptol value, and +the `MIRValue` type represents SAW-related MIR values. `Term`s can be thought +of as a subset of `MIRValue`s, and the `mir_term` function is used to promote a +`Term` to a `MIRValue`. + +Most other MIR-related commands work over `MIRValue`s, as can be seen with +SAW's `:type` command: + +:::{code-block} console +sawscript> :type mir_execute_func +[MIRValue] -> MIRSetup () +sawscript> :type mir_return +MIRValue -> MIRSetup () +::: + +Note that `MIRSetup` is the type of statements in a MIR specification, and two +`MIRSetup`-typed commands can be chained together by using `do`-notation. +Writing `MIRSetup ()` means that the statement does not return anything +interesting, and the use of `()` here is very much analogous to how `()` is +used in Rust. There are other `MIRSetup`-typed commands that _do_ return +something interesting, as is the case with `mir_fresh_var`: + +:::{code-block} console +sawscript> :type mir_fresh_var +String -> MIRType -> MIRSetup Term +::: + +This command returns a `MIRSetup Term`, which means that when you write `x <- +mir_fresh_var ... ...` in a MIR specification, then `x` will be bound at type +`Term`. + +Values of type `Term` have the property that they can be embedded into Cryptol +expression that are enclosed in double curly braces `{{ ... }}`. This is why +our earlier `{{ 2 * x }}` example works, as `x` is of type `Term`. + +## Preconditions and postconditions + +As a sanity check, let's write a naïve version of `times_two` that explicitly +returns `2 * x`: + +:::{literalinclude} code/times-two.rs +:lines: 5-7 +:language: rust +::: + +It seems like we should be able to verify this `times_two_ref` function using +the same spec that we used for `times_two`: + +:::{literalinclude} code/times-two-ref-fail.saw +:lines: 11 +:language: sawscript +::: + +Somewhat surprisingly, SAW fails to verify this function: + +:::{code-block} console +$ saw times-two-ref-fail.saw + + + +[18:58:22.578] Loading file "times-two-ref-fail.saw" +[18:58:22.608] Verifying times_two/56182919::times_two_ref[0] ... +[18:58:22.621] Simulating times_two/56182919::times_two_ref[0] ... +[18:58:22.622] Checking proof obligations times_two/56182919::times_two_ref[0] ... +[18:58:22.640] Subgoal failed: times_two/56182919::times_two_ref[0] attempt to compute `const 2_u32 * move _2`, which would overflow +[18:58:22.640] SolverStats {solverStatsSolvers = fromList ["SBV->Z3"], solverStatsGoalSize = 375} +[18:58:22.640] ----------Counterexample---------- +[18:58:22.640] x: 2147483648 +[18:58:22.640] Stack trace: +"mir_verify" (times-two-ref-fail.saw:11:1-11:11) +Proof failed. +::: + +The "`which would overflow`" portion of the error message suggests what went +wrong. When a Rust program is compiled with debug settings (which is the +default for `rustc` and `saw-rustc`), arithmetic operations such as +multiplication will check if the result of the operation can fit in the +requested number of bits. If not, the program will raise an error. + +In this case, we must make the result of multiplication fit in a `u32`, which +can represent values in the range `0` to `2^^32 - 1` (where `^^` is Cryptol's +exponentiation operator). But it is possible to take a number in this range, +multiply it by two, and have the result fall outside of the range. In fact, SAW +gives us a counterexample with exactly this number: `2147483648`, which can +also be written as `2^^31`. Multiplying this by two yields `2^^32`, which is +just outside of the range of values expressible with `u32`. SAW's duties +include checking that a function cannot fail at runtime, so this function falls +afoul of that check. + +Note that we didn't have this problem with the original definition of +`times_two` because the semantics of `<<` are such that if the result is too +large to fit in the requested type, then the result will _overflow_, i.e., wrap +back around to zero and count up. This means that `(2^^31) << 1` evaluates to +`0` rather than raising an error. Cryptol's multiplication operation also +performs integer overflow (unlike Rust in debug settings), which is why we +didn't notice any overflow-related issues when verifying `times_two`. + +There are two possible ways that we can repair this. One way is to rewrite +`times_two_ref` to use Rust's +[`wrapping_mul`](https://doc.rust-lang.org/std/primitive.u32.html#method.wrapping_mul) +function, a variant of multiplication that always uses integer overflow. This +work around the issue, but it is a bit more verbose. + +The other way is to make our spec more precise such that we only verify +`times_two_ref` for particular inputs. Although `times_two_ref` will run into +overflow-related issues when the argument is `2^^31` or greater, it is +perfectly fine for inputs smaller than `2^^31`. We can encode such an +assumption in SAW by adding a _precondition_. To do so, we write a slightly +modified version of `times_two_spec`: + +:::{literalinclude} code/times-two.saw +:lines: 13-18 +:language: sawscript +::: + +The most notable change is the `mir_precond {{ x < 2^^31 }};` line. +`mir_precond` (where "`precond`" is short for "precondition") is a command that +takes a `Term` argument that contains a boolean predicate, such as `{{ x < +2^^31 }}`. Declaring a precondition requires that this predicate must hold +during verification, and any values of `x` that do not satisfy this predicate +are not considered. + +By doing this, we have limited the range of the function from `0` to `2^^31 - +1`, which is exactly the range of values for which `times_two_ref` is well +defined. SAW will confirm this if we run it: + +:::{literalinclude} code/times-two.saw +:lines: 20 +:language: sawscript +::: + +:::{code-block} console +[19:23:53.480] Verifying times_two/56182919::times_two_ref[0] ... +[19:23:53.496] Simulating times_two/56182919::times_two_ref[0] ... +[19:23:53.497] Checking proof obligations times_two/56182919::times_two_ref[0] ... +[19:23:53.531] Proof succeeded! times_two/56182919::times_two_ref[0] +::: + +We can add as many preconditions to a spec as we see fit. For instance, if we +only want to verify `times_two_ref` for positive integers, we could add an +additional assumption: + +:::{literalinclude} code/times-two.saw +:lines: 22-28 +:language: sawscript +::: + +In addition to preconditions, SAW also supports postconditions. Whereas +preconditions represent conditions that must hold _before_ invoking a function, +postconditions represent conditions that must hold _after_ invoking a function. +We have already seen one type of postcondition in the form of the `mir_return` +command, which imposes a postcondition on what the return value must be equal +to. + +We can introduce additional postconditions with the `mir_postcond` command. +For example, if we call `times_two_ref` with a positive argument, then it +should be the case that the return value should be strictly greater than the +argument value. We can check for this using `mir_postcond` like so: + +:::{literalinclude} code/times-two.saw +:lines: 32-39 +:language: sawscript +::: + +An additional convenience that SAW offers is the `mir_assert` command. +`mir_assert` has the same type as `mir_precond` and `mir_postcond`, but +`mir_assert` can be used to declare both preconditions _and_ postconditions. +The difference is where `mir_assert` appears in a specification. If +`mir_assert` is used before the call to `mir_execute_func`, then it declares a +precondition. If `mir_assert` is used after the call to `mir_execute_func`, +then it declares a postcondition. + +For example, we can rewrite `times_two_ref_positive_postcond_spec` to use +`mir_assert`s like so: + +:::{literalinclude} code/times-two.saw +:lines: 43-50 +:language: sawscript +::: + +The choice of whether to use `mir_precond`/`mir_postcond` versus `mir_assert` is +mostly a matter personal taste. diff --git a/doc/rust-verification-with-saw/static-items.md b/doc/rust-verification-with-saw/static-items.md new file mode 100644 index 0000000000..be1fcdf5f4 --- /dev/null +++ b/doc/rust-verification-with-saw/static-items.md @@ -0,0 +1,240 @@ +# Static items + +Sometimes, Rust code makes use of [_static +items_](https://doc.rust-lang.org/reference/items/static-items.html), which are +definitions that are defined in a precise memory location for the entire +duration of the program. As such, static items can be thought of as a form of +global variables. + +## Immutable static items + +There are two kinds of static items in Rust: mutable static items (which have a +`mut` keyword) and immutable static items (which lack `mut`). Immutable static +items are much easier to deal with, so let's start by looking at an example of +a program that uses immutable static data: + +:::{literalinclude} code/statics.rs +:lines: 1-5 +:language: rust +::: + +This function will return `ANSWER`, i.e., `42`. We can write a spec that says +as much: + +:::{literalinclude} code/statics.saw +:lines: 5-9 +:language: sawscript +::: + +This works, but it is somewhat unsatisfying, as it requires hard-coding the +value of `ANSWER` into the spec. Ideally, we'd not have to think about the +precise implementation of static items like `ANSWER`. Fortunately, SAW makes +this possible by providing a `mir_static_initializer` function which computes +the initial value of a static item at the start of the program: + +:::{code-block} console +sawscript> :type mir_static_initializer +String -> MIRValue +::: + +In this case, `mir_static_initializer "statics::ANSWER"` is equivalent to +writing `mir_term {{ 42 : [32] }}`, so this spec is also valid: + +:::{literalinclude} code/statics.saw +:lines: 14-18 +:language: sawscript +::: + +Like `mir_verify`, the `mir_static_initializer` function expects a full +identifier as an argument, so we must write `"statics::ANSWER"` instead of +just `"ANSWER". + +At the MIR level, there is a unique reference to every static item. You can +obtain this reference by using the `mir_static` function: + +:::{code-block} console +sawscript> :type mir_static +String -> MIRValue +::: + +Here is one situation in which you would need to use a _reference_ to a static +item (which `mir_static` computes) rather than the _value_ of a static item +(which `mir_static_initializer` computes): + +:::{literalinclude} code/statics.rs +:lines: 7-9 +:language: rust +::: + +A spec for this function would look like this: + +:::{literalinclude} code/statics.saw +:lines: 23-27 +:language: sawscript +::: + +That's about all there is to say regarding immutable static items. + +## Mutable static items + +Mutable static items are a particularly tricky aspect of Rust. Including a +mutable static item in your program is tantamount to having mutable global +state that any function can access and modify. They are so tricky, in fact, +that Rust does not even allow you to use them unless you surround them in an +`unsafe` block: + +:::{literalinclude} code/statics.rs +:lines: 11-15 +:language: rust +::: + +The `mir_static_initializer` and `mut_static` functions work both immutable and +mutable static items, so we can write specs for mutable items using mostly the +same techniques as for writing specs for immutable items. We must be careful, +however, as SAW is pickier when it comes to specifying the initial values of +mutable static items. For example, here is naïve attempt at porting the spec +for `answer_to_the_ultimate_question` over to its mutable static counterpart, +`mut_answer_to_the_ultimate_question`: + +:::{literalinclude} code/statics-fail.saw +:lines: 5-9 +:language: sawscript +::: + +This looks plausible, but SAW will fail to verify it: + +:::{code-block} console +[21:52:32.738] Verifying statics/28a97e47::mut_answer_to_the_ultimate_question[0] ... +[21:52:32.745] Simulating statics/28a97e47::mut_answer_to_the_ultimate_question[0] ... +... +Symbolic execution failed. +Abort due to assertion failure: + statics.rs:14:14: 14:24: error: in statics/28a97e47::mut_answer_to_the_ultimate_question[0] + attempted to read empty mux tree +::: + +Oh no! Recall that we have seen this "`attempted to read empty mux tree`" error +message once before when discussing reference types. This error arose when we +attempted to read from uninitialized memory from a reference value. The same +situation applies here. A static item is backed by a reference, and SAW +deliberately does _not_ initialize the memory that a mutable static reference +points to upon program startup. Since we did not initialize `MUT_ANSWER`'s +reference value in the preconditions of the spec, SAW crashes at simulation +time when it attempts to read from the uninitialized memory. + +The solution to this problem is to perform this initialization explicitly using +`mir_points_to` in the preconditions of the spec. For example, this is a valid +spec: + +:::{literalinclude} code/statics.saw +:lines: 32-40 +:language: sawscript +::: + +We don't necessarily have to use `mir_static_initializer` as the starting value +for `MUT_ANSWER`, however. This spec, which uses `27` as the starting value, is +equally valid: + +:::{literalinclude} code/statics.saw +:lines: 45-52 +:language: sawscript +::: + +At this point, you are likely wondering: why do we need to explicitly +initialize mutable static references but not immutable static references? After +all, when we wrote a spec for `answer_to_the_ultimate_question` earlier, we +managed to get away with not initializing the reference for `ANSWER` (which is +an immutable static item). The difference is that the value of a mutable static +item can change over the course of a program, and SAW requires that you be very +careful in specifying what a mutable static value is at the start of a +function. For example, consider a slightly extended version of the earlier Rust +code we saw: + +:::{literalinclude} code/statics.rs +:lines: 11-22 +:language: rust +::: + +Suppose someone were to ask you "what value does +`mut_answer_to_the_ultimate_question` return?" This is not a straightforward +question to answer, as the value that it returns depends on the surrounding +context. If you were to call `mut_answer_to_the_ultimate_question` right as the +program started, it would return `42`. If you were to call +`mut_answer_to_the_ultimate_question` as part of the implementation of +`alternate_universe`, however, then it would return `27`! This is an inherent +danger of using mutable static items, as they can be modified at any time by +any function. For this reason, SAW requires you to be explicit about what the +initial values of mutable static items should be. + +### Mutable static items and compositional overrides + +In the "Overrides and mutable references" section, we discussed the potential +pitfalls of using mutable references in compositional overrides. Mutable static +items are also mutable values that are backed by references, and as such, they +are also subject to the same pitfalls. Let's see an example of this: + +:::{literalinclude} code/statics-compositional.rs +:lines: 1-12 +:language: rust +::: + +The setup is almost the same, except that instead of passing a mutable +reference as an argument to `side_effect`, we instead declare a mutable static +item `A` that is shared between `side_effect` and `foo`. We could potentially +write SAW specs for `side_effect` and `foo` like these: + +:::{literalinclude} code/statics-compositional-fail.saw +:lines: 5-20 +:language: sawscript +::: + +Note that we have once again underspecified the behavior of `side_effect`, as +we do not say what `A`'s value should be in the postconditions of +`side_effect_spec`. Similarly, `foo_spec` is wrong, as it should return `0` +rather than the initial value of `A`. By similar reasoning as before, we run +the risk that using `side_effect_ov` could lead use to prove something +incorrect. Thankfully, SAW can also catch this sort of mistake: + +:::{code-block} console +[15:46:38.525] Verifying statics_compositional/16fea9c0::side_effect[0] ... +[15:46:38.533] Simulating statics_compositional/16fea9c0::side_effect[0] ... +[15:46:38.533] Checking proof obligations statics_compositional/16fea9c0::side_effect[0] ... +[15:46:38.533] Proof succeeded! statics_compositional/16fea9c0::side_effect[0] +[15:46:38.533] Verifying statics_compositional/16fea9c0::foo[0] ... +[15:46:38.542] Simulating statics_compositional/16fea9c0::foo[0] ... +[15:46:38.542] Matching 1 overrides of statics_compositional/16fea9c0::side_effect[0] ... +[15:46:38.542] Branching on 1 override variants of statics_compositional/16fea9c0::side_effect[0] ... +... +State of mutable static variable "statics_compositional/16fea9c0::A[0]" not described in postcondition +::: + +To repair this proof, add a `mir_points_to` statement in the postconditions of +`side_effect_spec`: + +:::{literalinclude} code/statics-compositional.saw +:lines: 7-13 +:language: sawscript +::: + +And then correct the behavior of `foo_spec`: + +:::{literalinclude} code/statics-compositional.saw +:lines: 15-22 +:language: sawscript +::: + +Be warned that if your program declares any mutable static items, then any +compositional override _must_ state what the value of each mutable static item +is in its postconditions. This applies _even if the override does not directly +use the mutable static items_. For example, if we had declared a second mutable +static item alongside `A`: + +:::{code-block} rust +static mut A: u32 = 42; +static mut B: u32 = 27; +::: + +Then `side_effect_spec` would need an additional `mir_points_to` statement +involving `B` to satisfy this requirement. This requirement is somewhat +heavy-handed, but it is necessary in general to avoid unsoundness. Think +carefully before you use mutable static items! diff --git a/doc/saw-lexer/pyproject.toml b/doc/saw-lexer/pyproject.toml new file mode 100644 index 0000000000..d4a437374d --- /dev/null +++ b/doc/saw-lexer/pyproject.toml @@ -0,0 +1,13 @@ +[build-system] +requires = [ + "setuptools", +] +build-backend = "setuptools.build_meta" + +[project] +name = "saw-lexer" +version = "0.0.1" +dependencies = ["pygments"] + +[project.entry-points."pygments.lexers"] +saw_lexer = "saw_lexer:SAWScriptLexer" diff --git a/doc/saw-lexer/src/saw_lexer.py b/doc/saw-lexer/src/saw_lexer.py new file mode 100644 index 0000000000..cf656d08b8 --- /dev/null +++ b/doc/saw-lexer/src/saw_lexer.py @@ -0,0 +1,76 @@ +from pygments.lexer import RegexLexer, bygroups, using +from pygments.lexers.haskell import CryptolLexer +from pygments.token import ( + Punctuation, + Literal, + Comment, + Operator, + Keyword, + Name, + String, + Number, + Whitespace, +) + + +class SAWScriptLexer(RegexLexer): + name = "SAWScript" + aliases = ["sawscript", "saw-script"] + filenames = ["*.saw"] + tokens = { + "root": [ + # Whitespace-like tokens + (r"\s+", Whitespace), + (r"//.*?$", Comment.Singleline), + (r"/\*", Comment.Multiline, "comment"), + # String literals + (r"\"", String.Double, "string"), + # Embedded Cryptol + (r"\{\{|\{\|", Literal, "cryptol"), + # Symbols + (r"<-|->|==>", Operator.Word), + (r"~|-|\*|\+|/|%|\^|#|>>|>=|>|<<|<=|<|==|!=|&&|&|\|\||\||@", Operator), + (r"=", Operator.Word), + (r",|;|\(|\)|::|:|\[|\]|\{|\}|\.|\\", Punctuation), + # Reserved words (Keywords, types, constants, builtins) + # These require at least a space after (so we don't eat part of some + # other identifier) + ( + r"(import|and|let|rec|in|do|if|then|else|as|hiding|typedef)(\s+)", + bygroups(Keyword.Reserved, Whitespace), + ), + ( + ( + r"(CryptolSetup|JavaSetup|LLVMSetup|MIRSetup|ProofScript|" + r"TopLevel|CrucibleSetup|Int|String|Term|Type|Bool|AIG|" + r"CFG|CrucibleMethodSpec|LLVMSpec|JVMMethodSpec|JVMSpec|" + r"MIRSpec)(\s+)" + ), + bygroups(Keyword.Type, Whitespace), + ), + (r"(true|false|abc|z3)(\s+)", bygroups(Keyword.Constant, Whitespace)), + # All other identifiers + (r"[a-zA-Z_][\w']*", Name), + # Number literals + (r"0[bB][0-1]+", Number.Bin), + (r"0[oO][0-7]+", Number.Oct), + (r"0[xX][0-9a-fA-F]", Number.Hex), + (r"\d+", Number.Integer), + ], + "comment": [ + (r"[^*/]", Comment.Multiline), + # Allows for arbitrary nesting + (r"/\*", Comment.Multiline, "#push"), + (r"\*/", Comment.Multiline, "#pop"), + (r"[*/]", Comment.Multiline), + ], + "string": [ + ('[^"]+', String.Double), + ('"', String.Double, "#pop"), + ], + "cryptol": [ + (r"[^|}]", using(CryptolLexer)), + (r"\|\}|\}\}", Literal, "#pop"), + (r"[|}]", using(CryptolLexer)), + ], + } diff --git a/doc/saw-user-manual/analyzing-hardware-circuits-using-yosys.md b/doc/saw-user-manual/analyzing-hardware-circuits-using-yosys.md new file mode 100644 index 0000000000..288197aae3 --- /dev/null +++ b/doc/saw-user-manual/analyzing-hardware-circuits-using-yosys.md @@ -0,0 +1,248 @@ +# Analyzing Hardware Circuits using Yosys + +SAW has experimental support for analysis of hardware descriptions written in VHDL ([via GHDL](https://github.com/ghdl/ghdl-yosys-plugin)) through an intermediate representation produced by [Yosys](https://yosyshq.net/yosys/). +This generally follows the same conventions and idioms used in the rest of SAWScript. + +## Processing VHDL With Yosys + +Given a VHDL file `test.vhd` containing an entity `test`, one can generate an intermediate representation `test.json` suitable for loading into SAW: + +:::{code-block} console +$ ghdl -a test.vhd +$ yosys +... +Yosys 0.10+1 (git sha1 7a7df9a3b4, gcc 10.3.0 -fPIC -Os) +yosys> ghdl test + +1. Executing GHDL. +Importing module test. + +yosys> write_json test.json + +2. Executing JSON backend. +::: + +It can sometimes be helpful to invoke additional Yosys passes between the `ghdl` and `write_json` commands. +For example, at present SAW does not support the `$pmux` cell type. +Yosys is able to convert `$pmux` cells into trees of `$mux` cells using the `pmuxtree` command. +We expect there are many other situations where Yosys' considerable library of commands is valuable for pre-processing. + +## Example: Ripple-Carry Adder + +Consider three VHDL entities. +First, a half-adder: + +:::{code-block} vhdl +library ieee; +use ieee.std_logic_1164.all; + +entity half is + port ( + a : in std_logic; + b : in std_logic; + c : out std_logic; + s : out std_logic + ); +end half; + +architecture halfarch of half is +begin + c <= a and b; + s <= a xor b; +end halfarch; +::: + +Next, a one-bit adder built atop that half-adder: + +:::{code-block} vhdl +library ieee; +use ieee.std_logic_1164.all; + +entity full is + port ( + a : in std_logic; + b : in std_logic; + cin : in std_logic; + cout : out std_logic; + s : out std_logic + ); +end full; + +architecture fullarch of full is + signal half0c : std_logic; + signal half0s : std_logic; + signal half1c : std_logic; +begin + half0 : entity work.half port map (a => a, b => b, c => half0c, s => half0s); + half1 : entity work.half port map (a => half0s, b => cin, c => half1c, s => s); + cout <= half0c or half1c; +end fullarch; +::: + +Finally, a four-bit adder: + +:::{code-block} vhdl +library ieee; +use ieee.std_logic_1164.all; + +entity add4 is + port ( + a : in std_logic_vector(0 to 3); + b : in std_logic_vector(0 to 3); + res : out std_logic_vector(0 to 3) + ); +end add4; + +architecture add4arch of add4 is + signal full0cout : std_logic; + signal full1cout : std_logic; + signal full2cout : std_logic; + signal ignore : std_logic; +begin + full0 : entity work.full port map (a => a(0), b => b(0), cin => '0', cout => full0cout, s => res(0)); + full1 : entity work.full port map (a => a(1), b => b(1), cin => full0cout, cout => full1cout, s => res(1)); + full2 : entity work.full port map (a => a(2), b => b(2), cin => full1cout, cout => full2cout, s => res(2)); + full3 : entity work.full port map (a => a(3), b => b(3), cin => full2cout, cout => ignore, s => res(3)); +end add4arch; +::: + +Using GHDL and Yosys, we can convert the VHDL source above into a format that SAW can import. +If all of the code above is in a file `adder.vhd`, we can run the following commands: + +:::{code-block} console +$ ghdl -a adder.vhd +$ yosys -p 'ghdl add4; write_json adder.json' +::: + +The produced file `adder.json` can then be loaded into SAW with `yosys_import`: + +:::{code-block} console +$ saw +... +sawscript> enable_experimental +sawscript> m <- yosys_import "adder.json" +sawscript> :type m +Term +sawscript> type m +[23:57:14.492] {add4 : {a : [4], b : [4]} -> {res : [4]}, + full : {a : [1], b : [1], cin : [1]} -> {cout : [1], s : [1]}, + half : {a : [1], b : [1]} -> {c : [1], s : [1]}} +::: + +`yosys_import` returns a `Term` with a Cryptol record type, where the fields correspond to each VHDL module. +We can access the fields of this record like we would any Cryptol record, and call the functions within like any Cryptol function. + +:::{code-block} console +sawscript> type {{ m.add4 }} +[00:00:25.255] {a : [4], b : [4]} -> {res : [4]} +sawscript> eval_int {{ (m.add4 { a = 1, b = 2 }).res }} +[00:02:07.329] 3 +::: + +We can also use all of SAW's infrastructure for asking solvers about `Term`s, such as the `sat` and `prove` commands. +For example: + +:::{code-block} console +sawscript> sat w4 {{ m.add4 === \_ -> { res = 5 } }} +[00:04:41.993] Sat: [_ = (5, 0)] +sawscript> prove z3 {{ m.add4 === \inp -> { res = inp.a + inp.b } }} +[00:05:43.659] Valid +sawscript> prove yices {{ m.add4 === \inp -> { res = inp.a - inp.b } }} +[00:05:56.171] Invalid: [_ = (8, 13)] +::: + +The full library of `ProofScript` tactics is available in this setting. +If necessary, proof tactics like `simplify` can be used to rewrite goals before querying a solver. + +Special support is provided for the common case of equivalence proofs between HDL modules and other `Term`s (e.g. Cryptol functions, other HDL modules, or "extracted" imperative LLVM or JVM code). +The command `yosys_verify` has an interface similar to `llvm_verify`: given a specification, some lemmas, and a proof tactic, it produces evidence of a proven equivalence that may be passed as a lemma to future calls of `yosys_verify`. +For example, consider the following Cryptol specifications for one-bit and four-bit adders: + +:::{code-block} cryptol +cryfull : {a : [1], b : [1], cin : [1]} -> {cout : [1], s : [1]} +cryfull inp = { cout = [cout], s = [s] } + where [cout, s] = zext inp.a + zext inp.b + zext inp.cin + +cryadd4 : {a : [4], b : [4]} -> {res : [4]} +cryadd4 inp = { res = inp.a + inp.b } +::: + +We can prove equivalence between `cryfull` and the VHDL `full` module: + +:::{code-block} console +sawscript> full_spec <- yosys_verify {{ m.full }} [] {{ cryfull }} [] w4; +::: + +The result `full_spec` can then be used as an "override" when proving equivalence between `cryadd4` and the VHDL `add4` module: + +:::{code-block} console +sawscript> add4_spec <- yosys_verify {{ m.add4 }} [] {{ cryadd4 }} [full_spec] w4; +::: + +The above could also be accomplished through the use of `prove_print` and term rewriting, but it is much more verbose. + +`yosys_verify` may also be given a list of preconditions under which the equivalence holds. +For example, consider the following Cryptol specification for `full` that ignores the `cin` bit: + +:::{code-block} cryptol +cryfullnocarry : {a : [1], b : [1], cin : [1]} -> {cout : [1], s : [1]} +cryfullnocarry inp = { cout = [cout], s = [s] } + where [cout, s] = zext inp.a + zext inp.b +::: + +This is not equivalent to `full` in general, but it is if constrained to inputs where `cin = 0`. +We may express that precondition like so: + +:::{code-block} console +sawscript> full_nocarry_spec <- yosys_verify {{ adderm.full }} [{{\(inp : {a : [1], b : [1], cin : [1]}) -> inp.cin == 0}}] {{ cryfullnocarry }} [] w4; +::: + +The resulting override `full_nocarry_spec` may still be used in the proof for `add4` (this is accomplished by rewriting to a conditional expression). + +## API Reference + +N.B: The following commands must first be enabled using `enable_experimental`. + +- `yosys_import : String -> TopLevel Term` produces a `Term` given the path to a JSON file produced by the Yosys `write_json` command. + The resulting term is a Cryptol record, where each field corresponds to one HDL module exported by Yosys. + Each HDL module is in turn represented by a function from a record of input port values to a record of output port values. + For example, consider a Yosys JSON file derived from the following VHDL entities: + + :::{code-block} vhdl + entity half is + port ( + a : in std_logic; + b : in std_logic; + c : out std_logic; + s : out std_logic + ); + end half; + + entity full is + port ( + a : in std_logic; + b : in std_logic; + cin : in std_logic; + cout : out std_logic; + s : out std_logic + ); + end full; + ::: + + The resulting `Term` will have the type: + + :::{code-block} cryptol + { half : {a : [1], b : [1]} -> {c : [1], s : [1]} + , full : {a : [1], b : [1], cin : [1]} -> {cout : [1], s : [1]} + } + ::: + +- `yosys_verify : Term -> [Term] -> Term -> [YosysTheorem] -> ProofScript () -> TopLevel YosysTheorem` proves equality between an HDL module and a specification. + The first parameter is the HDL module - given a record `m` from `yosys_import`, this will typically look something like `{{ m.foo }}`. + The second parameter is a list of preconditions for the equality. + The third parameter is the specification, a term of the same type as the HDL module, which will typically be some Cryptol function or another HDL module. + The fourth parameter is a list of "overrides", which witness the results of previous `yosys_verify` proofs. + These overrides can be used to simplify terms by replacing use sites of submodules with their specifications. + + Note that `Term`s derived from HDL modules are "first class", and are not restricted to `yosys_verify`: they may also be used with SAW's typical `Term` infrastructure like `sat`, `prove_print`, term rewriting, etc. + `yosys_verify` simply provides a convenient and familiar interface, similar to `llvm_verify` or `jvm_verify`. diff --git a/doc/saw-user-manual/bisimulation-prover.md b/doc/saw-user-manual/bisimulation-prover.md new file mode 100644 index 0000000000..e5d8c931b1 --- /dev/null +++ b/doc/saw-user-manual/bisimulation-prover.md @@ -0,0 +1,310 @@ +# Bisimulation Prover + +SAW contains a bisimulation prover to prove that two terms simulate each other. +This prover allows users to prove that two terms executing in lockstep satisfy +some relations over the state of each circuit and their outputs. This type of +proof is useful in demonstrating the eventual equivalence of two circuits, or +of a circuit and a functional specification. SAW enables these proofs with the +experimental `prove_bisim` command: + +:::{code-block} sawscript +prove_bisim : ProofScript () -> [BisimTheorem] -> Term -> Term -> Term -> Term -> TopLevel BisimTheorem +::: + +When invoking `prove_bisim strat theorems srel orel lhs rhs`, the arguments +represent the following: + +1. `strat`: A proof strategy to use during verification. +2. `theorems`: A list of already proven bisimulation theorems. +3. `srel`: A state relation between `lhs` and `rhs`. This relation must have + the type `lhsState -> rhsState -> Bit`. The relation's first argument is + `lhs`'s state prior to execution. The relation's second argument is `rhs`'s + state prior to execution. `srel` then returns a `Bit` indicating whether + the two arguments satisfy the bisimulation's state relation. +4. `orel`: An output relation between `lhs` and `rhs`. This relation must have + the type `(lhsState, output) -> (rhsState, output) -> Bit`. The relation's + first argument is a pair consisting of `lhs`'s state and output following + execution. The relation's second argument is a pair consisting of `rhs`'s + state and output following execution. `orel` then returns a `Bit` indicating + whether the two arguments satisfy the bisimulation's output relation. +5. `lhs`: A term that simulates `rhs`. `lhs` must have the type + `(lhsState, input) -> (lhsState, output)`. The first argument to `lhs` is a + tuple containing the internal state of `lhs`, as well as the input to `lhs`. + `lhs` returns a tuple containing its internal state after execution, as well + as its output. +6. `rhs`: A term that simulates `lhs`. `rhs` must have the type + `(rhsState, input) -> (rhsState, output)`. The first argument to `rhs` is a + tuple containing the internal state of `rhs`, as well as the input to `rhs`. + `rhs` returns a tuple containing its internal state after execution, as well + as its output. + +On success, `prove_bisim` returns a `BisimTheorem` that can be used in future +bisimulation proofs to enable compositional bisimulation proofs. On failure, +`prove_bisim` will abort. + +## Bisimulation Example + +This section walks through an example proving that the Cryptol implementation +of an AND gate that makes use of internal state and takes two cycles to +complete is equivalent to a pure function that computes the logical AND of its +inputs in one cycle. First, we define the implementation's state type: + +:::{code-block} cryptol +type andState = { loaded : Bit, origX : Bit, origY : Bit } +::: + +`andState` is a record type with three fields: + +1. `loaded`: A `Bit` indicating whether the input to the AND gate has been + loaded into the state record. +2. `origX`: A `Bit` storing the first input to the AND gate. +3. `origY`: A `Bit` storing the second input to the AND gate. + +Now, we define the AND gate's implementation: + +:::{code-block} cryptol +andImp : (andState, (Bit, Bit)) -> (andState, (Bit, Bit)) +andImp (s, (x, y)) = + if s.loaded /\ x == s.origX /\ y == s.origY + then (s, (True, s.origX && s.origY)) + else ({ loaded = True, origX = x, origY = y }, (False, 0)) +::: + +`andImp` takes a tuple as input where the first field is an `andState` holding +the gate's internal state, and second field is a tuple containing the inputs to +the AND gate. `andImp` returns a tuple consisting of the updated `andState` and +the gate's output. The output is a tuple where the first field is a ready bit +that is `1` when the second field is ready to be read, and the second field +is the result of gate's computation. + +`andImp` takes two cycles to complete: + +1. The first cycle loads the inputs into its state's `origX` and `origY` fields + and sets `loaded` to `True`. It sets both of its output bits to `0`. +2. The second cycle uses the stored input values to compute the logical AND. + It sets its ready bit to `1` and its second output to the logical AND + result. + +So long as the inputs remain fixed after the second cycle, `andImp`'s output +remains unchanged. If the inputs change, then `andImp` restarts the +computation (even if the inputs change between the first and second cycles). + +Next, we define the pure function we'd like to prove `andImp` bisimilar to: + +:::{code-block} cryptol +andSpec : ((), (Bit, Bit)) -> ((), (Bit, Bit)) +andSpec (_, (x, y)) = ((), (True, x && y)) +::: + +`andSpec` takes a tuple as input where the first field is `()`, indicating that +`andSpec` is a pure function without internal state, and the second field is a +tuple containing the inputs to the AND function. `andSpec` returns a tuple +consisting of `()` (again, because `andSpec` is stateless) and the function's +output. Like `andImp`, the output is a tuple where the first field is a ready +bit that is `1` when the second field is ready to be read, and the second field +is the result of the function's computation. + +`andSpec` completes in a single cycle, and as such its ready bit is always `1`. +It computes the logical AND directly on the function's inputs using Cryptol's +`(&&)` operator. + +Next, we define a state relation over `andImp` and `andSpec`: + +:::{code-block} cryptol +andStateRel : andState -> () -> Bit +andStateRel _ () = True +::: + +`andStateRel` takes two arguments: + +1. An `andState` for `andImp`. +2. An empty state (`()`) for `andSpec`. + +`andStateRel` returns a `Bit` indicating whether the relation is satisified. In +this case, `andStateRel` always returns `True` because `andSpec` is stateless +and therefore the state relation permits `andImp` to accept any state. + +Lastly, we define a relation over `andImp` and `andSpec`: + +:::{code-block} cryptol +andOutputRel : (andState, (Bit, Bit)) -> ((), (Bit, Bit)) -> Bit +andOutputRel (s, (impReady, impO)) ((), (_, specO)) = + if impReady then impO == specO else True +::: + +`andOutputRel` takes two arguments: + +1. A return value from `andImp`. Specifically, a pair consisting of an + `andState` and a pair containing a ready bit and result of the logical AND. +2. A return value from `andSpec`. Specifically, a pair consisting of an empty + state `()` and a pair containing a ready bit and result of the logical AND. + +`andOutputRel` returns a `Bit` indicating whether the relation is satisfied. It +considers the relation satisfied in two ways: + +1. If `andImp`'s ready bit is set, the relation is satisfied if the output + values `impO` and `specO` from `andImp` and `andSpec` respectively are + equivalent. +2. If `andImp`'s ready bit is not set, the relation is satisfied. + +Put another way, the relation is satisfied if the end result of `andImp` and +`andSpec` are equivalent. The relation permits intermediate outputs to differ. + +We can verify that this relation is always satisfied--and therefore the two +terms are bisimilar--by using `prove_bisim`: + +:::{code-block} sawscript +import "And.cry"; +enable_experimental; + +and_bisim <- prove_bisim z3 [] {{ andStateRel }} {{ andOutputRel }} {{ andImp }} {{ andSpec }}; +::: + +Upon running this script, SAW prints: + +:::{code-block} console +Successfully proved bisimulation between andImp and andSpec +::: + +### Building a NAND gate + +We can make the example more interesting by reusing components to build a NAND +gate. We first define a state type for the NAND gate implementation that +contains `andImp`'s state. This NAND gate will not need any additional state, +so we will define a type `nandState` that is equal to `andState`: + +:::{code-block} cryptol +type nandState = andState +::: + +Now, we define an implementation `nandImp` that calls `andImp` and negates the +result: + +:::{code-block} cryptol +nandImp : (nandState, (Bit, Bit)) -> (nandState, (Bit, Bit)) +nandImp x = (s, (andReady, ~andRes)) + where + (s, (andReady, andRes)) = andImp x +::: + +Note that `nandImp` is careful to preserve the ready status of `andImp`. +Because `nandImp` relies on `andImp`, it also takes two cycles to compute the +logical NAND of its inputs. + +Next, we define a specification `nandSpec` in terms of `andSpec`: + +:::{code-block} cryptol +nandSpec : ((), (Bit, Bit)) -> ((), (Bit, Bit)) +nandSpec (_, (x, y)) = ((), (True, ~ (andSpec ((), (x, y))).1.1)) +::: + +As with `andSpec`, `nandSpec` is pure and computes its result in a single +cycle. + +Next, we define a state relation over `nandImp` and `nandSpec`: + +:::{code-block} cryptol +nandStateRel : andState -> () -> Bit +nandStateRel _ () = True +::: + +As with `andStateRel`, this state relation is always `True` because `nandSpec` +is stateless. + +Lastly, we define an output relation indicating that `nandImp` and `nandSpec` +produce equivalent results once `nandImp`'s ready bit is `1`: + +:::{code-block} cryptol +nandOutputRel : (nandState, (Bit, Bit)) -> ((), (Bit, Bit)) -> Bit +nandOutputRel (s, (impReady, impO)) ((), (_, specO)) = + if impReady then impO == specO else True +::: + +To prove that `nandImp` and `nandSpec` are bisimilar, we again use +`prove_bisim`. This time however, we can reuse the bisimulation proof for the +AND gate by including it in the `theorems` paramter for `prove_bisim`: + +:::{code-block} sawscript +prove_bisim z3 [and_bisim] {{ nandStateRel }} {{ nandOutputRel }} {{ nandImp }} {{ nandSpec }}; +::: + +Upon running this script, SAW prints: + +:::{code-block} console +Successfully proved bisimulation between nandImp and nandSpec +::: + +## Understanding the proof goals + +While not necessary for simple proofs, more advanced proofs may require +inspecting proof goals. `prove_bisim` generates and attempts to solve the +following proof goals: + +:::{code-block} text +OUTPUT RELATION THEOREM: + forall s1 s2 in. + srel s1 s2 -> orel (lhs (s1, in)) (rhs (s2, in)) + +STATE RELATION THEOREM: + forall s1 s2 out1 out2. + orel (s1, out1) (s2, out2) -> srel s1 s2 +::: + +where the variables in the `forall`s are: + +- `s1`: Initial state for `lhs` +- `s2`: Initial state for `rhs` +- `in`: Input value to `lhs` and `rhs` +- `out1`: Initial output value for `lhs` +- `out2`: Initial output value for `rhs` + +The `STATE RELATION THEOREM` verifies that the output relation properly captures +the guarantees of the state relation. The `OUTPUT RELATION THEOREM` verifies +that if `lhs` and `rhs` are executed with related states, then the result of +that execution is also related. These two theorems together guarantee that the +terms simulate each other. + +When using composition, `prove_bisim` also generates and attempts to solve +the proof goal below for any successfully applied `BisimTheorem` in the +`theorems` list: + +:::{code-block} text +COMPOSITION SIDE CONDITION: + forall g_lhs_s g_rhs_s. + g_srel g_lhs_s g_rhs_s -> f_srel f_lhs_s f_rhs_s + where + f_lhs_s = extract_inner_state g_lhs g_lhs_s f_lhs + f_rhs_s = extract_inner_state g_rhs g_rhs_s f_rhs +::: + +where `g_lhs` is an outer term containing a call to an inner term `f_lhs` +represented by a `BisimTheorem` and `g_rhs` is an outer term containing a call +to an inner term `f_rhs` represented by the same `BisimTheorem`. The variables +in `COMPOSITION SIDE CONDITION` are: + +- `extract_inner_state x x_s y`: A helper function that takes an outer term `x`, an + outer state `x_s`, and an inner term `y`, and returns the inner state of `x_s` + that `x` passes to `y`. +- `g_lhs_s`: The state for `g_lhs` +- `g_rhs_s`: The state for `g_rhs` +- `g_srel`: The state relation for `g_lhs` and `g_rhs` +- `f_srel`: The state relation for `f_lhs` and `f_rhs` +- `f_lhs_s`: The state for `f_lhs`, as represented in `g_lhs_s` (extracted using + `extract_inner_state`). +- `f_rhs_s`: The state for `f_rhs`, as represented in `g_rhs_s` (extracted using + `extract_inner_state`). + +The `COMPOSITION SIDE CONDITION` exists to verify that the terms in the +bisimulation relation properly set up valid states for subterms they contain. + +## Limitations + +For now, the `prove_bisim` command has a couple limitations: + +- `lhs` and `rhs` must be named functions. This is because `prove_bisim` uses + these names to perform substitution when making use of compositionality. +- Each subterm present in the list of bisimulation theorems already + proven may be invoked at most once in `lhs` or `rhs`. That is, if some + function `g_lhs` calls `f_lhs`, and `prove_bisim` is invoked with a + `BisimTheorem` proving that `f_lhs` is bisimilar to `f_rhs`, then `g_lhs` may + call `f_lhs` at most once. diff --git a/doc/saw-user-manual/creating-symbolic-variables.md b/doc/saw-user-manual/creating-symbolic-variables.md new file mode 100644 index 0000000000..0abab7e099 --- /dev/null +++ b/doc/saw-user-manual/creating-symbolic-variables.md @@ -0,0 +1,124 @@ +# Creating Symbolic Variables + +The direct extraction process just discussed automatically introduces +symbolic variables and then abstracts over them, yielding a SAWScript +`Term` that reflects the semantics of the original Java, LLVM, or MIR code. +For simple functions, this is often the most convenient interface. For +more complex code, however, it can be necessary (or more natural) to +specifically introduce fresh variables and indicate what portions of the +program state they correspond to. + +- `fresh_symbolic : String -> Type -> TopLevel Term` is responsible for +creating new variables in this context. The first argument is a name +used for pretty-printing of terms and counter-examples. In many cases it +makes sense for this to be the same as the name used within SAWScript, +as in the following: + +:::{code-block} sawscript +x <- fresh_symbolic "x" ty; +::: + +However, using the same name is not required. + +The second argument to `fresh_symbolic` is the type of the fresh +variable. Ultimately, this will be a SAWCore type; however, it is usually +convenient to specify it using Cryptol syntax with the type quoting +brackets `{|` and `|}`. For example, creating a 32-bit integer, as +might be used to represent a Java `int` or an LLVM `i32`, can be done as +follows: + +:::{code-block} sawscript +x <- fresh_symbolic "x" {| [32] |}; +::: + +Although symbolic execution works best on symbolic variables, which are +"unbound" or "free", most of the proof infrastructure within SAW uses +variables that are _bound_ by an enclosing lambda expression. Given a +`Term` with free symbolic variables, we can construct a lambda term that +binds them in several ways. + +- `abstract_symbolic : Term -> Term` finds all symbolic variables in the +`Term` and constructs a lambda expression binding each one, in some +order. The result is a function of some number of arguments, one for +each symbolic variable. It is the simplest but least flexible way to +bind symbolic variables. + +:::{code-block} console +sawscript> x <- fresh_symbolic "x" {| [8] |} +sawscript> let t = {{ x + x }} +sawscript> print_term t +let { x@1 = Prelude.Vec 8 Prelude.Bool + } + in Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool (Cryptol.TCNum 8)) + x + x +sawscript> let f = abstract_symbolic t +sawscript> print_term f +let { x@1 = Prelude.Vec 8 Prelude.Bool + } + in \(x : x@1) -> + Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool (Cryptol.TCNum 8)) x x +::: + +If there are multiple symbolic variables in the `Term` passed to +`abstract_symbolic`, the ordering of parameters can be hard to predict. +In some cases (such as when a proof is the immediate next step, and it's +expected to succeed) the order isn't important. In others, it's nice to +have more control over the order. + +- `lambda : Term -> Term -> Term` is the building block for controlled +binding. It takes two terms: the one to transform, and the portion of +the term to abstract over. Generally, the first `Term` is one obtained +from `fresh_symbolic` and the second is a `Term` that would be passed to +`abstract_symbolic`. + +:::{code-block} console +sawscript> let f = lambda x t +sawscript> print_term f +let { x@1 = Prelude.Vec 8 Prelude.Bool + } + in \(x : x@1) -> + Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool (Cryptol.TCNum 8)) x x +::: + +- `lambdas : [Term] -> Term -> Term` allows you to list the order in which +symbolic variables should be bound. Consider, for example, a `Term` +which adds two symbolic variables: + +:::{code-block} console +sawscript> x1 <- fresh_symbolic "x1" {| [8] |} +sawscript> x2 <- fresh_symbolic "x2" {| [8] |} +sawscript> let t = {{ x1 + x2 }} +sawscript> print_term t +let { x@1 = Prelude.Vec 8 Prelude.Bool + } + in Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool (Cryptol.TCNum 8)) + x1 + x2 +::: + +We can turn `t` into a function that takes `x1` followed by `x2`: + +:::{code-block} console +sawscript> let f1 = lambdas [x1, x2] t +sawscript> print_term f1 +let { x@1 = Prelude.Vec 8 Prelude.Bool + } + in \(x1 : x@1) -> + \(x2 : x@1) -> + Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool (Cryptol.TCNum 8)) x1 + x2 +::: + +Or we can turn `t` into a function that takes `x2` followed by `x1`: + +:::{code-block} console +sawscript> let f1 = lambdas [x2, x1] t +sawscript> print_term f1 +let { x@1 = Prelude.Vec 8 Prelude.Bool + } + in \(x2 : x@1) -> + \(x1 : x@1) -> + Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool (Cryptol.TCNum 8)) x1 + x2 +::: diff --git a/doc/saw-user-manual/cryptol-and-its-role-in-saw.md b/doc/saw-user-manual/cryptol-and-its-role-in-saw.md new file mode 100644 index 0000000000..43aab9df33 --- /dev/null +++ b/doc/saw-user-manual/cryptol-and-its-role-in-saw.md @@ -0,0 +1,262 @@ +# Cryptol and its Role in SAW + +Cryptol is a domain-specific language originally designed for the +high-level specification of cryptographic algorithms. It is general +enough, however, to describe a wide variety of programs, and is +particularly applicable to describing computations that operate on +streams of data of some fixed size. + +In addition to being integrated into SAW, Cryptol is a standalone +language with [its own +manual](http://cryptol.net/files/ProgrammingCryptol.pdf). + +SAW includes deep support for Cryptol, and in fact requires the use of +Cryptol for most non-trivial tasks. To fully understand the rest of +this manual and to effectively use SAW, you will need to develop at least +a rudimentary understanding of Cryptol. + +The primary use of Cryptol within SAWScript is to construct values of type +`Term`. Although `Term` values can be constructed from various sources, +inline Cryptol expressions are the most direct and convenient way to create +them. + +Specifically, a Cryptol expression can be placed inside double curly +braces (`{{` and `}}`), resulting in a value of type `Term`. As a very +simple example, there is no built-in integer addition operation in +SAWScript. However, we can use Cryptol's built-in integer addition operator +within SAWScript as follows: + +:::{code-block} console +sawscript> let t = {{ 0x22 + 0x33 }} +sawscript> print t +85 +sawscript> :type t +Term +::: + +Although it printed out in the same way as an `Int`, it is important to +note that `t` actually has type `Term`. We can see how this term is +represented internally, before being evaluated, with the `print_term` +function. + +:::{code-block} console +sawscript> print_term t +let { x@1 = Prelude.Vec 8 Prelude.Bool + x@2 = Cryptol.TCNum 8 + x@3 = Cryptol.PLiteralSeqBool x@2 + } + in Cryptol.ecPlus x@1 (Cryptol.PArithSeqBool x@2) + (Cryptol.ecNumber (Cryptol.TCNum 34) x@1 x@3) + (Cryptol.ecNumber (Cryptol.TCNum 51) x@1 x@3) +::: + +For the moment, it's not important to understand what this output means. +We show it only to clarify that `Term` values have their own internal +structure that goes beyond what exists in SAWScript. The internal +representation of `Term` values is in a language called SAWCore. The +full semantics of SAWCore are beyond the scope of this manual. + +The text constructed by `print_term` can also be accessed +programmatically (instead of printing to the screen) using the +`show_term` function, which returns a `String`. The `show_term` function +is not a command, so it executes directly and does not need `<-` to bind +its result. Therefore, the following will have the same result as the +`print_term` command above: + +:::{code-block} console +sawscript> let s = show_term t +sawscript> :type s +String +sawscript> print s + +::: + +Numbers are printed in decimal notation by default when printing terms, +but the following two commands can change that behavior. + +- `set_ascii : Bool -> TopLevel ()`, when passed `true`, makes +subsequent `print_term` or `show_term` commands print sequences of bytes +as ASCII strings (and doesn't affect printing of anything else). + +- `set_base : Int -> TopLevel ()` prints all bit vectors in the given +base, which can be between 2 and 36 (inclusive). + +A `Term` that represents an integer (any bit vector, as affected by +`set_base`) can be translated into a SAWScript `Int` using the +`eval_int : Term -> Int` function. This function returns an +`Int` if the `Term` can be represented as one, and fails at runtime +otherwise. + +:::{code-block} console +sawscript> print (eval_int t) +85 +sawscript> print (eval_int {{ True }}) + +"eval_int" (:1:1): +eval_int: argument is not a finite bitvector +sawscript> print (eval_int {{ [True] }}) +1 +::: + +Similarly, values of type `Bit` in Cryptol can be translated into values +of type `Bool` in SAWScript using the `eval_bool : Term -> Bool` function: + +:::{code-block} console +sawscript> let b = {{ True }} +sawscript> print_term b +Prelude.True +sawscript> print (eval_bool b) +true +::: + +Anything with sequence type in Cryptol can be translated into a list of +`Term` values in SAWScript using the `eval_list : Term -> [Term]` function. + +:::{code-block} console +sawscript> let l = {{ [0x01, 0x02, 0x03] }} +sawscript> print_term l +let { x@1 = Prelude.Vec 8 Prelude.Bool + x@2 = Cryptol.PLiteralSeqBool (Cryptol.TCNum 8) + } + in [Cryptol.ecNumber (Cryptol.TCNum 1) x@1 x@2 + ,Cryptol.ecNumber (Cryptol.TCNum 2) x@1 x@2 + ,Cryptol.ecNumber (Cryptol.TCNum 3) x@1 x@2] +sawscript> print (eval_list l) +[Cryptol.ecNumber (Cryptol.TCNum 1) (Prelude.Vec 8 Prelude.Bool) + (Cryptol.PLiteralSeqBool (Cryptol.TCNum 8)) +,Cryptol.ecNumber (Cryptol.TCNum 2) (Prelude.Vec 8 Prelude.Bool) + (Cryptol.PLiteralSeqBool (Cryptol.TCNum 8)) +,Cryptol.ecNumber (Cryptol.TCNum 3) (Prelude.Vec 8 Prelude.Bool) + (Cryptol.PLiteralSeqBool (Cryptol.TCNum 8))] +::: + +Finally, a list of `Term` values in SAWScript can be collapsed into a single +`Term` with sequence type using the `list_term : [Term] -> Term` function, +which is the inverse of `eval_list`. + +:::{code-block} console +sawscript> let ts = eval_list l +sawscript> let l = list_term ts +sawscript> print_term l +let { x@1 = Prelude.Vec 8 Prelude.Bool + x@2 = Cryptol.PLiteralSeqBool (Cryptol.TCNum 8) + } + in [Cryptol.ecNumber (Cryptol.TCNum 1) x@1 x@2 + ,Cryptol.ecNumber (Cryptol.TCNum 2) x@1 x@2 + ,Cryptol.ecNumber (Cryptol.TCNum 3) x@1 x@2] +::: + +In addition to being able to extract integer and Boolean values from +Cryptol expressions, `Term` values can be injected into Cryptol +expressions. When SAWScript evaluates a Cryptol expression between `{{` +and `}}` delimiters, it does so with several extra bindings in scope: + +- Any variable in scope that has SAWScript type `Bool` is visible in + Cryptol expressions as a value of type `Bit`. + +- Any variable in scope that has SAWScript type `Int` is visible in + Cryptol expressions as a _type variable_. Type variables can be + demoted to numeric bit vector values using the backtick (`` ` ``) + operator. + +- Any variable in scope that has SAWScript type `Term` is visible in + Cryptol expressions as a value with the Cryptol type corresponding to + the internal type of the term. The power of this conversion is that + the `Term` does not need to have originally been derived from a + Cryptol expression. + +In addition to these rules, bindings created at the Cryptol level, +either from included files or inside Cryptol quoting brackets, are +visible only to later Cryptol expressions, and not as SAWScript +variables. + +To make these rules more concrete, consider the following examples. If +we bind a SAWScript `Int`, we can use it as a Cryptol type variable. If +we create a `Term` variable that internally has function type, we can +apply it to an argument within a Cryptol expression, but not at the +SAWScript level: + +:::{code-block} console +sawscript> let n = 8 +sawscript> :type n +Int +sawscript> let {{ f (x : [n]) = x + 1 }} +sawscript> :type {{ f }} +Term +sawscript> :type f + +:1:1-1:2: unbound variable: "f" (:1:1-1:2) +sawscript> print {{ f 2 }} +3 +::: + +If `f` was a binding of a SAWScript variable to a `Term` of function +type, we would get a different error: + +:::{code-block} console +sawscript> let f = {{ \(x : [n]) -> x + 1 }} +sawscript> :type {{ f }} +Term +sawscript> :type f +Term +sawscript> print {{ f 2 }} +3 +sawscript> print (f 2) + +type mismatch: Int -> t.0 and Term + at "_" (REPL) + mismatched type constructors: (->) and Term +::: + +One subtlety of dealing with `Term`s constructed from Cryptol is that +because the Cryptol expressions themselves are type checked by the +Cryptol type checker, and because they may make use of other `Term` +values already in scope, they are not type checked until the Cryptol +brackets are evaluated. So type errors at the Cryptol level may occur at +runtime from the SAWScript perspective (though they occur before the +Cryptol expressions are run). + +So far, we have talked about using Cryptol _value_ expressions. However, +SAWScript can also work with Cryptol _types_. The most direct way to +refer to a Cryptol type is to use type brackets: `{|` and `|}`. Any +Cryptol type written between these brackets becomes a `Type` value in +SAWScript. Some types in Cryptol are _numeric_ (also known as _size_) +types, and correspond to non-negative integers. These can be translated +into SAWScript integers with the `eval_size` function. For example: + +:::{code-block} console +sawscript> let {{ type n = 16 }} +sawscript> eval_size {| n |} +16 +sawscript> eval_size {| 16 |} +16 +::: + +For non-numeric types, `eval_size` fails at runtime: + +:::{code-block} console +sawscript> eval_size {| [16] |} + +"eval_size" (:1:1): +eval_size: not a numeric type +::: + +In addition to the use of brackets to write Cryptol expressions inline, +several built-in functions can extract `Term` values from Cryptol files +in other ways. The `import` command at the top level imports all +top-level definitions from a Cryptol file and places them in scope +within later bracketed expressions. This includes [Cryptol `foreign` +declarations](https://galoisinc.github.io/cryptol/master/FFI.html). If a +[Cryptol implementation of a foreign +function](https://galoisinc.github.io/cryptol/master/FFI.html#cryptol-implementation-of-foreign-functions) +is present, then it will be used as the definition when reasoning about +the function. Otherwise, the function will be imported as an opaque +constant with no definition. + +The `cryptol_load` command behaves similarly, but returns a +`CryptolModule` instead. If any `CryptolModule` is in scope, its +contents are available qualified with the name of the `CryptolModule` +variable. A specific definition can be explicitly extracted from a +`CryptolModule` using the `cryptol_extract` command: + +- `cryptol_extract : CryptolModule -> String -> TopLevel Term` diff --git a/doc/saw-user-manual/direct-extraction.md b/doc/saw-user-manual/direct-extraction.md new file mode 100644 index 0000000000..1a97abe053 --- /dev/null +++ b/doc/saw-user-manual/direct-extraction.md @@ -0,0 +1,36 @@ +# Direct Extraction + +In the case of the `max` function described earlier, the relevant inputs +and outputs are immediately apparent. The function takes two integer +arguments, always uses both of them, and returns a single integer value, +making no other changes to the program state. + +In cases like this, a direct translation is possible, given only an +identification of which code to execute. Two functions exist to handle +such simple code. The first, for LLVM is the more stable of the two: + +- `llvm_extract : LLVMModule -> String -> TopLevel Term` + +A similar function exists for Java, but is more experimental. + +- `jvm_extract : JavaClass -> String -> TopLevel Term` + +Because of its lack of maturity, it (and later Java-related commands) +must be enabled by running the `enable_experimental` command beforehand. + +- `enable_experimental : TopLevel ()` + +The structure of these two extraction functions is essentially +identical. The first argument describes where to look for code (in +either a Java class or an LLVM module, loaded as described in the +previous section). The second argument is the name of the method or +function to extract. + +When the extraction functions complete, they return a `Term` +corresponding to the value returned by the function or method as a +function of its arguments. + +These functions currently work only for code that takes some fixed +number of integral parameters, returns an integral result, and does not +access any dynamically-allocated memory (although temporary memory +allocated during execution is allowed). diff --git a/doc/saw-user-manual/extraction-to-the-coq-theorem-prover.md b/doc/saw-user-manual/extraction-to-the-coq-theorem-prover.md new file mode 100644 index 0000000000..f9f944be82 --- /dev/null +++ b/doc/saw-user-manual/extraction-to-the-coq-theorem-prover.md @@ -0,0 +1,217 @@ +# Extraction to the Coq theorem prover + +In addition to the (semi-)automatic and compositional proof modes +already discussed above, SAW has experimental support for exporting +Cryptol and SAWCore values as terms to the Coq proof assistant[^5]. +This is intended to support more manual proof efforts for properties +that go beyond what SAW can support (for example, proofs requiring +induction) or for connecting to preexisting formalizations in Coq +of useful algorithms (e.g. the fiat crypto library[^6]). + +This support consists of two related pieces. The first piece is a library of +formalizations of the primitives underlying Cryptol and SAWCore and +various supporting concepts that help bridge the conceptual gap +between SAW and Coq. The second piece is a term translation +that maps the syntactic forms of SAWCore onto corresponding +concepts in Coq syntax, designed to dovetail with the concepts +defined in the support library. SAWCore is a quite similar language +to the core calculus underlying Coq, so much of this translation is +quite straightforward; however, the languages are not exactly +equivalent, and there are some tricky cases that mostly arise from +Cryptol code that can only be partially supported. We will note +these restrictions later in the manual. + +We expect this extraction process to work with a fairly wide range of Coq +versions, as we are not using bleeding-edge Coq features. It has been +most fully tested with Coq version 8.13.2. + +## Support Library + +In order to make use of SAW's extraction capabilities, one must first +compile the support library using Coq so that the included definitions +and theorems can be referenced by the extracted code. From the top +directory of the SAW source tree, the source code for this support +library can be found in the `saw-core-coq/coq` subdirectory. +In this subdirectory you will find a `_CoqProject` and a `Makefile`. +A simple `make` invocation should be enough to compile +all the necessary files, assuming Coq is installed and `coqc` is +available in the user's `PATH`. HTML documentation for the support +library can also be generated by `make html` from the same directory. + +Once the library is compiled, the recommended way to import +it into your subsequent development is by adding the following +lines to your `_CoqProject` file: + +:::{code-block} text +-Q /saw-core-coq/coq/generated/CryptolToCoq CryptolToCoq +-Q /saw-core-coq/coq/handwritten/CryptolToCoq CryptolToCoq +::: + +Here `` refers to the location on your system where the +SAWScript source tree is checked out. This will add the relevant +library files to the `CryptolToCoq` namespace, where the extraction +process will expect to find them. + +The support library for extraction is broken into two parts: those +files which are handwritten, versus those that are automatically +generated. The handwritten files are generally fairly readable and +are reasonable for human inspection; they define most of the +interesting pipe-fitting that allows Cryptol and SAWCore definitions +to connect to corresponding Coq concepts. In particular the +file `SAWCoreScaffolding.v` file defines most of the bindings of base +types to Coq types, and the `SAWCoreVectorsAsCoqVectors.v` defines the +core bitvector operations. The automatically generated files are direct +translations of the SAWCore source files +(`saw-core/prelude/Prelude.sawcore` and +`cryptol-saw-core/saw/Cryptol.sawcore`) that correspond to the +standard libraries for SAWCore and Cryptol, respectively. + +The autogenerated files are intended to be kept up-to-date with +changes in the corresponding `sawcore` files, and end users should +not need to generate them. Nonetheless, if they are out of sync for some +reason, these files may be regenerated using the `saw` executable +by running `(cd saw-core-coq; saw saw/generate_scaffolding.saw)` +from the top-level of the SAW source directory before compiling them +with Coq as described above. + +You may also note some additional files and concepts in the standard +library, such as `CompM.v`, and a variety of lemmas and definitions +related to it. These definitions are related to the "heapster" system, +which form a separate use-case for the SAWCore to Coq translation. +These definitions will not be used for code extracted from Cryptol. + +## Cryptol module extraction + +There are several modes of use for the SAW to Coq term extraction +facility, but the easiest to use is whole Cryptol module extraction. +This will extract all the definitions in the given Cryptol module, +together with it's transitive dependencies, into a single Coq module +which can then be compiled and pulled into subsequence developments. + +Suppose we have a Cryptol source file named `source.cry` and we want +to generate a Coq file named `output.v`. We can accomplish this by +running the following commands in saw (either directly from the `saw` +command prompt, or via a script file) + +:::{code-block} sawscript +enable_experimental; +write_coq_cryptol_module "source.cry" "output.v" [] []; +::: + +In this default mode, identifiers in the Cryptol source will be +directly translated into identifiers in Coq. This may occasionally +cause problems if source identifiers clash with Coq keywords or +preexisting definitions. The third argument to +`write_coq_cryptol_module` can be used to remap such names if +necessary by giving a list of `(in,out)` pairs of names. The fourth +argument is a list of source identifiers to skip translating, if +desired. Authoritative online documentation for this command can be +obtained directly from the `saw` executable via `:help +write_coq_cryptol_module` after `enable_experimental`. + +The resulting "output.v" file will have some of the usual hallmarks of +computer-generated code; it will have poor formatting and, explicit +parenthesis and fully-qualified names. Thankfully, once loaded into +Coq, the Coq pretty-printer will do a much better job of rendering +these terms in a somewhat human-readable way. + +## Proofs involving uninterpreted functions + +It is possible to write a Cryptol module that references uninterpreted +functions by using the `primitive` keyword to declare them in your +Cryptol source. Primitive Cryptol declarations will be translated into +Coq section variables; as usual in Coq, uses of these section +variables will be translated into additional parameters to the +definitions from outside the section. In this way, consumers of +the translated module can instantiate the declared Cryptol functions +with corresponding terms in subsequent Coq developments. + +Although the Cryptol interpreter itself will not be able to compute +with declared but undefined functions of this sort, they can be used +both to provide specifications for functions to be verified with +`llvm_verify` or `jvm_verify` and also for Coq extraction. + +For example, if I write the following Cryptol source file: + +:::{code-block} cryptol +primitive f : Integer -> Integer + +g : Integer -> Bool +g x = f (f x) > 0 +::: + +After extraction, the generated term `g` will have Coq type: + +:::{code-block} coq +(Integer -> Integer) -> Integer -> Bool +::: + +## Translation limitations and caveats + +Translation from Cryptol to Coq has a number of fundamental +limitations that must be addressed. The most severe of these is that +Cryptol is a fully general-recursive language, and may exhibit runtime +errors directly via calls to the `error` primitive, or via partial +operations (such as indexing a sequence out-of-bounds). The internal +language of Coq, by contrast, is a strongly-normalizing language of +total functions. As such, our translation is unable to extract all +Cryptol programs. + +### Recursive programs + +The most severe of the currently limitations for our system +is that the translation is unable to translate any recursive Cryptol +program. Doing this would require us to attempt to find some +termination argument for the recursive function sufficient to satisfy +Coq; for now, no attempt is made to do so. if you attempt to +extract a recursive function, SAW will produce an error +about a "malformed term" with `Prelude.fix` as the head symbol. + +Certain limited kinds of recursion are available via the +`foldl` Cryptol primitive operation, which is translated directly +into a fold operation in Coq. This is sufficient for many basic +iterative algorithms. + +### Type coercions + +Another limitation of the translation system is that Cryptol uses SMT +solvers during its typechecking process and uses the results of solver +proofs to justify some of its typing judgments. When extracting these +terms to Coq, type equality coercions must be generated. Currently, we +do not have a good way to transport the reasoning done inside +Cryptol's typechecker into Coq, so we just supply a very simple `Ltac` +tactic to discharge these coercions (see `solveUnsafeAssert` in +`CryptolPrimitivesForSAWCoreExtra.v`). This tactic is able to +discover simple coercions, but for anything nontrivial it may +fail. The result will be a typechecking failure when compiling the +generated code in Coq when the tactic fails. If you encounter this +problem, it may be possible to enhance the `solveUnsafeAssert` tactic +to cover your use case. + +### Error terms + +A final caveat that is worth mentioning is that Cryptol can sometimes +produce runtime errors. These can arise from explicit calls to the +`error` primitive, or from partially defined operations (e.g., +division by zero or sequence access out of bounds). Such instances +are translated to occurrences of an unrealized Coq axiom named `error`. +In order to avoid introducing an inconsistent environment, the `error` +axiom is restricted to apply only to inhabited types. All the types +arising from Cryptol programs are inhabited, so this is no problem +in principle. However, collecting and passing this information around +on the Coq side is a little tricky. + +The main technical device we use here is the `Inhabited` type class; +it simply asserts that a type has some distinguished inhabitant. We +provide instances for the base types and type constructors arising +from Cryptol, so the necessary instances ought to be automatically +constructed when needed. However, polymorphic Cryptol functions add +some complications, as type arguments must also come together with +evidence that they are inhabited. The translation process takes care +to add the necessary `Inhabited` arguments, so everything ought to +work out. However, if Coq typechecking of generated code fails with +errors about `Inhabited` class instances, it likely represents some +problem with this aspect of the translation. + +[^5]: +[^6]: diff --git a/doc/saw-user-manual/formal-deprecation-process.md b/doc/saw-user-manual/formal-deprecation-process.md new file mode 100644 index 0000000000..c778b49b1d --- /dev/null +++ b/doc/saw-user-manual/formal-deprecation-process.md @@ -0,0 +1,38 @@ +# Formal Deprecation Process + +SAW primitives, and thus their associated SAWScript built-ins, sometimes +become obsolete or are found inadequate and replaced. +The process by which that happens has three steps, as follows: + +1. The decision is made to deprecate and eventually remove the objects +in question. +This can happen at the level of individual built-in elements (for example, +when replacing a function with an awkward interface or unfortunate name) or +at the level of internal units of functionality with possibly multiple +built-ins affected. +At this step the built-ins in question are marked for a deprecation warning. +They remain available by default, but referring to them will trigger a +warning. + +2. The objects in question are made invisible by default. +Now, referring to the affected built-ins will fail unless the +`enable_deprecated` command is used. +In that case referring to them will still produce a warning. + +3. The objects in question are removed entirely and are no longer +available. + +In general any object or group of objects will move only one step per +release; that is, something first marked deprecated (so it warns) in +saw-script 1.2 will not disappear by default before saw-script 1.3 and +not be removed entirely before saw-script 1.4. +The time frame may be longer depending on the needs of downstream +users, the complexity of migration, and the cost/impact of keeping the +deprecated code in the system. + +We may move faster if circumstances dictate, but hope not to need to. + +Objects that have never appeared in a release, or that have never +moved past experimental may be removed without first being deprecated. +However, we aim to avoid this in cases where the objects in question +have gotten substantial use despite their formal status. diff --git a/doc/saw-user-manual/index.md b/doc/saw-user-manual/index.md new file mode 100644 index 0000000000..e868c0816c --- /dev/null +++ b/doc/saw-user-manual/index.md @@ -0,0 +1,36 @@ +# SAW User Manual + +:::{warning} +The SAW User Manual is currently being restructured/rewritten in large chunks. + +It is likely that you will encounter typos, omissions, and otherwise incorrect +or incomplete information about SAW and SAWScript's features and operation. + +Your help identifying these [documentation +issues](https://github.com/GaloisInc/saw-script/issues?q=is%3Aissue%20state%3Aopen%20label%3Adocumentation) +is deeply appreciated, as it helps us prioritize and resolve them more quickly. +Thanks for helping to make SAW better for everyone! +::: + +```{toctree} +:maxdepth: 2 +:caption: Contents + +introduction +invoking-saw +structure-of-sawscript +the-term-type +cryptol-and-its-role-in-saw +transforming-term-values +proofs-about-terms +symbolic-execution +symbolic-termination +loading-code +direct-extraction +creating-symbolic-variables +specification-based-verification +extraction-to-the-coq-theorem-prover +analyzing-hardware-circuits-using-yosys +bisimulation-prover +formal-deprecation-process +``` diff --git a/doc/saw-user-manual/introduction.md b/doc/saw-user-manual/introduction.md new file mode 100644 index 0000000000..c1ded8eb5e --- /dev/null +++ b/doc/saw-user-manual/introduction.md @@ -0,0 +1,32 @@ +# Introduction + +The Software Analysis Workbench (SAW) is a tool for constructing +mathematical models of the computational behavior of software, +transforming these models, and proving properties about them. + +SAW can currently construct models of a subset of programs written in +Cryptol, LLVM (and therefore C), and JVM (and therefore Java). SAW also has +experimental, incomplete support for MIR (and therefore Rust). The +models take the form of typed functional programs, so in a sense SAW can +be considered a translator from imperative programs to their functional +equivalents. Various external proof tools, including a variety of SAT +and SMT solvers, can be used to prove properties about the functional +models. SAW can construct models from arbitrary Cryptol programs, and +from C and Java programs that have fixed-size inputs and outputs and +that terminate after a fixed number of iterations of any loop (or a +fixed number of recursive calls). One common use case is to verify that +an algorithm specification in Cryptol is equivalent to an algorithm +implementation in C or Java. + +The process of extracting models from programs, manipulating them, +forming queries about them, and sending them to external provers is +orchestrated using a special purpose language called SAWScript. +SAWScript is a typed functional language with support for sequencing of +imperative commands. + +The rest of this document first describes how to use the SAW tool, +`saw`, and outlines the structure of the SAWScript language and its +relationship to Cryptol. It then presents the SAWScript commands that +transform functional models and prove properties about them. Finally, it +describes the specific commands available for constructing models from +imperative programs. diff --git a/doc/saw-user-manual/invoking-saw.md b/doc/saw-user-manual/invoking-saw.md new file mode 100644 index 0000000000..1f6e5880a7 --- /dev/null +++ b/doc/saw-user-manual/invoking-saw.md @@ -0,0 +1,96 @@ +# Invoking SAW + +The primary mechanism for interacting with SAW is through the `saw` +executable included as part of the standard binary distribution. With no +arguments, `saw` starts a read-evaluate-print loop (REPL) that allows +the user to interactively evaluate commands in the SAWScript language. +With one file name argument, it executes the specified file as a SAWScript +program. + +In addition to a file name, the `saw` executable accepts several +command-line options: + +`-h, -?, --help` + + ~ Print a help message. + +`-V, --version` + + ~ Show the version of the SAWScript interpreter. + +`-c path, --classpath=path` + + ~ Specify a colon-delimited list of paths to search for Java classes. + +`-i path, --import-path=path` + + ~ Specify a colon-delimited list of paths to search for imports. + +`-t, --extra-type-checking` + + ~ Perform extra type checking of intermediate values. + +`-I, --interactive` + + ~ Run interactively (with a REPL). This is the default if no other + arguments are specified. + +`-j path, --jars=path` + + ~ Specify a colon-delimited list of paths to `.jar` files to search + for Java classes. + +`-b path, --java-bin-dirs` + + ~ Specify a colon-delimited list of paths to search for a Java + executable. + +`-d num, --sim-verbose=num` + + ~ Set the verbosity level of the Java and LLVM simulators. + +`-v num, --verbose=num` + + ~ Set the verbosity level of the SAWScript interpreter. + +`--clean-mismatched-versions-solver-cache[=path]` + + ~ Run the `clean_mismatched_versions_solver_cache` command on the solver + cache at the given path, or if no path is given, the solver cache at the + value of the `SAW_SOLVER_CACHE_PATH` environment variable, then exit. See + the section **Caching Solver Results** for a description of the + `clean_mismatched_versions_solver_cache` command and the solver caching + feature in general. + +SAW also uses several environment variables for configuration: + +`CRYPTOLPATH` + + ~ Specify a colon-delimited list of directory paths to search for Cryptol + imports (including the Cryptol prelude). + +`PATH` + + ~ If the `--java-bin-dirs` option is not set, then the `PATH` will be + searched to find a Java executable. + +`SAW_IMPORT_PATH` + + ~ Specify a colon-delimited list of directory paths to search for imports. + +`SAW_JDK_JAR` + + ~ Specify the path of the `.jar` file containing the core Java + libraries. Note that that is not necessary if the `--java-bin-dirs` option + or the `PATH` environment variable is used, as SAW can use this information + to determine the location of the core Java libraries' `.jar` file. + +`SAW_SOLVER_CACHE_PATH` + + ~ Specify a path at which to keep a cache of solver results obtained during + calls to certain tactics. A cache is not created at this path until it is + needed. See the section **Caching Solver Results** for more detail about this + feature. + +On Windows, semicolon-delimited lists are used instead of colon-delimited +lists. diff --git a/doc/saw-user-manual/loading-code.md b/doc/saw-user-manual/loading-code.md new file mode 100644 index 0000000000..117731aefe --- /dev/null +++ b/doc/saw-user-manual/loading-code.md @@ -0,0 +1,254 @@ +# Loading Code + +The first step in analyzing any code is to load it into the system. + +## Loading LLVM + +To load LLVM code, simply provide the location of a valid bitcode file +to the `llvm_load_module` function. + +- `llvm_load_module : String -> TopLevel LLVMModule` + +The resulting `LLVMModule` can be passed into the various functions +described below to perform analysis of specific LLVM functions. + +The LLVM bitcode parser should generally work with LLVM versions between +3.5 and 16.0, though it may be incomplete for some versions. Debug +metadata has changed somewhat throughout that version range, so is the +most likely case of incompleteness. We aim to support every version +after 3.5, however, so report any parsing failures as [on +GitHub](https://github.com/GaloisInc/saw-script/issues). + +## Loading Java + +Loading Java code is slightly more complex, because of the more +structured nature of Java packages. First, when running `saw`, three flags +control where to look for classes: + +- The `-b` flag takes the path where the `java` executable lives, which is used + to locate the Java standard library classes and add them to the class + database. Alternatively, one can put the directory where `java` lives on the + `PATH`, which SAW will search if `-b` is not set. + +- The `-j` flag takes the name of a JAR file as an argument and adds the + contents of that file to the class database. + +- The `-c` flag takes the name of a directory as an argument and adds all class + files found in that directory (and its subdirectories) to the class database. + By default, the current directory is included in the class path. + +Most Java programs will only require setting the `-b` flag (or the `PATH`), as +that is enough to bring in the standard Java libraries. Note that when +searching the `PATH`, SAW makes assumptions about where the standard library +classes live. These assumptions are likely to hold on JDK 7 or later, but they +may not hold on older JDKs on certain operating systems. If you are using an +old version of the JDK and SAW is unable to find a standard Java class, you may +need to specify the location of the standard classes' JAR file with the `-j` +flag (or, alternatively, with the `SAW_JDK_JAR` environment variable). + +Once the class path is configured, you can pass the name of a class to +the `java_load_class` function. + +- `java_load_class : String -> TopLevel JavaClass` + +The resulting `JavaClass` can be passed into the various functions +described below to perform analysis of specific Java methods. + +Java class files from any JDK newer than version 6 should work. However, +support for JDK 9 and later is experimental. Verifying code that only uses +primitive data types is known to work well, but there are some as-of-yet +unresolved issues in verifying code involving classes such as `String`. For +more information on these issues, refer to +[this GitHub issue](https://github.com/GaloisInc/crucible/issues/641). + +(loading-mir)= +## Loading MIR + +To load a piece of Rust code, first compile it to a MIR JSON file, as described +in [this section](#compiling-mir), and then provide the location of the JSON +file to the `mir_load_module` function: + +- `mir_load_module : String -> TopLevel MIRModule` + +SAW currently supports Rust code that can be built with a [January 23, 2023 +Rust nightly](https://static.rust-lang.org/dist/2023-01-23/). If you encounter +a Rust feature that SAW does not support, please report it [on +GitHub](https://github.com/GaloisInc/saw-script/issues). + +## Notes on Compiling Code for SAW + +SAW will generally be able to load arbitrary LLVM bitcode, JVM bytecode, and +MIR JSON files, but several guidelines can help make verification easier or +more likely to succeed. + +### Compiling LLVM + +For generating LLVM with `clang`, it can be helpful to: + +- Turn on debugging symbols with `-g` so that SAW can find source + locations of functions, names of variables, etc. + +- Optimize with `-O1` so that the generated bitcode more closely matches + the C/C++ source, making the results more comprehensible. + +- Use `-fno-threadsafe-statics` to prevent `clang` from emitting + unnecessary pthread code. + +- Link all relevant bitcode with `llvm-link` (including, _e.g._, the C++ + standard library when analyzing C++ code). + +All SAW proofs include side conditions to rule out undefined behavior, +and proofs will only succeed if all of these side conditions have been +discharged. However the default SAW notion of undefined behavior is with +respect to the semantics of LLVM, rather than C or C++. If you want to +rule out undefined behavior according to the C or C++ standards, +consider compiling your code with `-fsanitize=undefined` or one of the +related flags[^1] to `clang`. + +Generally, you'll also want to use `-fsanitize-trap=undefined`, or one +of the related flags, to cause the compiled code to use `llvm.trap` to +indicate the presence of undefined behavior. Otherwise, the compiled +code will call a separate function, such as +`__ubsan_handle_shift_out_of_bounds`, for each type of undefined +behavior, and SAW currently does not have built in support for these +functions (though you could manually create overrides for them in a +verification script). + +[^1]: + +### Compiling Java + +For Java, the only compilation flag that tends to be valuable is `-g` to +retain information about the names of function arguments and local +variables. + +(compiling-mir)= +### Compiling MIR + +In order to verify Rust code, SAW analyzes Rust's MIR (mid-level intermediate +representation) language. In particular, SAW analyzes a particular form of MIR +that the [`mir-json`](https://github.com/GaloisInc/mir-json) tool produces. You +will need to intall `mir-json` and run it on Rust code in order to produce MIR +JSON files that SAW can load (see [this section](#loading-mir)). You will also +need to use `mir-json` to build custom versions of the Rust standard libraries +that are more suited to verification purposes. + +If you are working from a checkout of the `saw-script` repo, you can install +the `mir-json` tool and the custom Rust standard libraries by performing the +following steps: + +1. Clone the [`crucible`](https://github.com/GaloisInc/crucible) and `mir-json` + submodules like so: + + :::{code-block} console + $ git submodule update deps/crucible deps/mir-json + ::: + +2. Navigate to the `mir-json` submodule: + + :::{code-block} console + $ cd deps/mir-json + ::: + +3. Follow the instructions laid out in the [`mir-json` installation + instructions](https://github.com/GaloisInc/mir-json#installation-instructions) + in order to install `mir-json`. + +4. Navigate to the + [`crux-mir`](https://github.com/GaloisInc/crucible/tree/master/crux-mir) + subdirectory of the `crucible` submodule: + + :::{code-block} console + $ cd ../crucible/crux-mir/ + ::: + +5. Run the `translate_libs.sh` script: + + :::{code-block} console + $ ./translate_libs.sh + ::: + + This will compile the custom versions of the Rust standard libraries using + `mir-json`, placing the results under the `rlibs` subdirectory. + +6. Finally, define a `SAW_RUST_LIBRARY_PATH` environment variable that points + to the newly created `rlibs` subdirectory: + + :::{code-block} console + $ export SAW_RUST_LIBRARY_PATH=<...>/crucible/crux-mir/rlibs + ::: + +For `cargo`-based projects, `mir-json` provides a `cargo` subcommand called +`cargo saw-build` that builds a JSON file suitable for use with SAW. `cargo +saw-build` integrates directly with `cargo`, so you can pass flags to it like +any other `cargo` subcommand. For example: + +:::{code-block} console +# Make sure that SAW_RUST_LIBRARY_PATH is defined, as described above +$ cargo saw-build + +linking 11 mir files into <...>/example-364cf2df365c7055.linked-mir.json + +::: + +Note that: + +- The full output of `cargo saw-build` here is omitted. The important part is + the `.linked-mir.json` file that appears after `linking X mir files into`, as + that is the JSON file that must be loaded with SAW. +- `SAW_RUST_LIBRARY_PATH` should point to the the MIR JSON files for the Rust + standard library. + +`mir-json` also supports compiling individual `.rs` files through `mir-json`'s +`saw-rustc` command. As the name suggests, it accepts all of the flags that +`rustc` accepts. For example: + +:::{code-block} console +# Make sure that SAW_RUST_LIBRARY_PATH is defined, as described above +$ saw-rustc example.rs + +linking 11 mir files into <...>/example.linked-mir.json + +::: + +## Notes on C++ Analysis + +The distance between C++ code and LLVM is greater than between C and +LLVM, so some additional considerations come into play when analyzing +C++ code with SAW. + +The first key issue is that the C++ standard library is large and +complex, and tends to be widely used by C++ applications. To analyze +most C++ code, it will be necessary to link your code with a version of +the `libc++` library[^2] compiled to LLVM bitcode. The `wllvm` program +can[^3] be useful for this. + +The C++ standard library includes a number of key global variables, and +any code that touches them will require that they be initialized using +`llvm_alloc_global`. + +Many C++ names are slightly awkward to deal with in SAW. They may be +mangled relative to the text that appears in the C++ source code. SAW +currently only understands the mangled names. The `llvm-nm` program can +be used to show the list of symbols in an LLVM bitcode file, and the +`c++filt` program can be used to demangle them, which can help in +identifying the symbol you want to refer to. In addition, C++ names from +namespaces can sometimes include quote marks in their LLVM encoding. For +example: + +:::{code-block} llvm +%"class.quux::Foo" = type { i32, i32 } +::: + +This can be mentioned in SAW by saying: + +:::{code-block} sawscript +llvm_type "%\"class.quux::Foo\"" +::: + +Finally, there is no support for calling constructors in specifications, +so you will need to construct objects piece-by-piece using, _e.g._, +`llvm_alloc` and `llvm_points_to`. + +[^2]: +[^3]: diff --git a/doc/saw-user-manual/proofs-about-terms.md b/doc/saw-user-manual/proofs-about-terms.md new file mode 100644 index 0000000000..ee244368f4 --- /dev/null +++ b/doc/saw-user-manual/proofs-about-terms.md @@ -0,0 +1,435 @@ +# Proofs about Terms + +The goal of SAW is to facilitate proofs about the behavior of +programs. It may be useful to prove some small fact to use as a rewrite +rule in later proofs, but ultimately these rewrite rules come together +into a proof of some higher-level property about a software system. + +Whether proving small lemmas (in the form of rewrite rules) or a +top-level theorem, the process builds on the idea of a _proof script_ +that is run by one of the top level proof commands. + +- `prove_print : ProofScript () -> Term -> TopLevel Theorem` +takes a proof script (which we'll describe next) and a `Term`. The +`Term` should be of function type with a return value of `Bool` (`Bit` +at the Cryptol level). It will then use the proof script to attempt to +show that the `Term` returns `True` for all possible inputs. If it is +successful, it will print `Valid` and return a `Theorem`. If not, it +will abort. + +- `sat_print : ProofScript () -> Term -> TopLevel ()` is similar +except that it looks for a _single_ value for which the `Term` evaluates +to `True` and prints out that value, returning nothing. + +- `prove_core : ProofScript () -> String -> TopLevel Theorem` +proves and returns a `Theorem` from a string in SAWCore syntax. + +## Automated Tactics + +The simplest proof scripts just specify the automated prover to use. +The `ProofScript` values `abc` and `z3` select the ABC and Z3 theorem +provers, respectively, and are typically good choices. + +For example, combining `prove_print` with `abc`: + +:::{code-block} console +sawscript> t <- prove_print abc {{ \(x:[8]) -> x+x == x*2 }} +Valid +sawscript> t +Theorem (let { x@1 = Prelude.Vec 8 Prelude.Bool + x@2 = Cryptol.TCNum 8 + x@3 = Cryptol.PArithSeqBool x@2 + } + in (x : x@1) +-> Prelude.EqTrue + (Cryptol.ecEq x@1 (Cryptol.PCmpSeqBool x@2) + (Cryptol.ecPlus x@1 x@3 x x) + (Cryptol.ecMul x@1 x@3 x + (Cryptol.ecNumber (Cryptol.TCNum 2) x@1 + (Cryptol.PLiteralSeqBool x@2))))) +::: + +Similarly, `sat_print` will show that the function returns `True` for +one specific input (which it should, since we already know it returns +`True` for all inputs): + +:::{code-block} console +sawscript> sat_print abc {{ \(x:[8]) -> x+x == x*2 }} +Sat: [x = 0] +::: + +In addition to these, the `bitwuzla`, `boolector`, `cvc4`, `cvc5`, `mathsat`, +and `yices` provers are available. The internal decision procedure `rme`, short +for Reed-Muller Expansion, is an automated prover that works particularly well +on the Galois field operations that show up, for example, in AES. + +In more complex cases, some pre-processing can be helpful or necessary +before handing the problem off to an automated prover. The +pre-processing can involve rewriting, beta reduction, unfolding, the use +of provers that require slightly more configuration, or the use of +provers that do very little real work. + +## Proof Script Diagnostics + +During development of a proof, it can be useful to print various +information about the current goal. The following tactics are useful in +that context. + +- `print_goal : ProofScript ()` prints the entire goal in SAWCore +syntax. + +- `print_goal_consts : ProofScript ()` prints a list of unfoldable constants +in the current goal. + +- `print_goal_depth : Int -> ProofScript ()` takes an integer argument, `n`, +and prints the goal up to depth `n`. Any elided subterms are printed +with a `...` notation. + +- `print_goal_size : ProofScript ()` prints the number of nodes in the +DAG representation of the goal. + +## Rewriting in Proof Scripts + +One of the key techniques available for completing proofs in SAWScript +is the use of rewriting or transformation. The following commands +support this approach. + +- `simplify : Simpset -> ProofScript ()` works just like `rewrite`, +except that it works in a `ProofScript` context and implicitly +transforms the current (unnamed) goal rather than taking a `Term` as a +parameter. + +- `goal_eval : ProofScript ()` will evaluate the current proof goal to a +first-order combination of primitives. + +- `goal_eval_unint : [String] -> ProofScript ()` works like `goal_eval` +but avoids expanding or simplifying the given names. + +## Other Transformations + +Some useful transformations are not easily specified using equality +statements, and instead have special tactics. + +- `beta_reduce_goal : ProofScript ()` works like `beta_reduce_term` but +on the current goal. It takes any sub-expression of the form `(\x -> t) +v` and replaces it with a transformed version of `t` in which all +instances of `x` are replaced by `v`. + +- `unfolding : [String] -> ProofScript ()` works like `unfold_term` but +on the current goal. + +Using `unfolding` is mostly valuable for proofs +based entirely on rewriting, since the default behavior for automated +provers is to unfold everything before sending a goal to a prover. +However, with some provers it is possible to indicate that specific +named subterms should be represented as uninterpreted functions. + +- `unint_bitwuzla : [String] -> ProofScript ()` + +- `unint_cvc4 : [String] -> ProofScript ()` + +- `unint_cvc5 : [String] -> ProofScript ()` + +- `unint_yices : [String] -> ProofScript ()` + +- `unint_z3 : [String] -> ProofScript ()` + +The list of `String` arguments in these cases indicates the names of the +subterms to leave folded, and therefore present as uninterpreted +functions to the prover. To determine which folded constants appear in a +goal, use the `print_goal_consts` function described above. + +Ultimately, we plan to implement a more generic tactic that leaves +certain constants uninterpreted in whatever prover is ultimately used +(provided that uninterpreted functions are expressible in the prover). + +Note that each of the `unint_*` tactics have variants that are prefixed +with `sbv_` and `w4_`. The `sbv_`-prefixed tactics make use of the SBV +library to represent and solve SMT queries: + +- `sbv_unint_bitwuzla : [String] -> ProofScript ()` + +- `sbv_unint_cvc4 : [String] -> ProofScript ()` + +- `sbv_unint_cvc5 : [String] -> ProofScript ()` + +- `sbv_unint_yices : [String] -> ProofScript ()` + +- `sbv_unint_z3 : [String] -> ProofScript ()` + +The `w4_`-prefixed tactics make use of the What4 library instead of SBV: + +- `w4_unint_bitwuzla : [String] -> ProofScript ()` + +- `w4_unint_cvc4 : [String] -> ProofScript ()` + +- `w4_unint_cvc5 : [String] -> ProofScript ()` + +- `w4_unint_yices : [String] -> ProofScript ()` + +- `w4_unint_z3 : [String] -> ProofScript ()` + +In most specifications, the choice of SBV versus What4 is not important, as +both libraries are broadly compatible in terms of functionality. There are some +situations where one library may outpeform the other, however, due to +differences in how each library represents certain SMT queries. There are also +some experimental features that are only supported with What4 at the moment, +such as `enable_lax_loads_and_stores`. + +## Caching Solver Results + +SAW has the capability to cache the results of tactics which call out to +automated provers. This can save a considerable amount of time in cases such as +proof development and CI, where the same proof scripts are often run repeatedly +without changes. + +This caching is available for all tactics which call out to automated provers +at runtime: `abc`, `boolector`, `cvc4`, `cvc5`, `mathsat`, `yices`, `z3`, +`rme`, and the family of `unint` tactics described in the previous section. + +When solver caching is enabled and one of the tactics mentioned above is +encountered, if there is already an entry in the cache corresponding to the +call then the cached result is used, otherwise the appropriate solver is +queried, and the result saved to the cache. Entries are indexed by a SHA256 +hash of the exact query to the solver (ignoring variable names), any options +passed to the solver, and the names and full version strings of all the solver +backends involved (e.g. ABC and SBV for the `abc` tactic). This ensures cached +results are only used when they would be identical to the result of actually +running the tactic. + +The simplest way to enable solver caching is to set the environment variable +`SAW_SOLVER_CACHE_PATH`. With this environment variable set, `saw` and +`saw-remote-api` will automatically keep an [LMDB](http://www.lmdb.tech/doc/) +database at the given path containing the solver result cache. Setting this +environment variable globally therefore creates a global, concurrency-safe +solver result cache used by all newly created `saw` or `saw-remote-api` +processes. Note that when this environment variable is set, SAW does not create +a cache at the specified path until it is actually needed. + +There are also a number of SAW commands related to solver caching. + +- `set_solver_cache_path` is like setting `SAW_SOLVER_CACHE_PATH` for the + remainder of the current session, but opens an LMDB database at the specified + path immediately. If a cache is already in use in the current session + (i.e. through a prior call to `set_solver_cache_path` or through + `SAW_SOLVER_CACHE_PATH` being set and the cache being used at least once) + then all entries in the cache already in use will be copied to the new cache + being opened. + +- `clean_mismatched_versions_solver_cache` will remove all entries in the + solver result cache which were created using solver backend versions which do + not match the versions in the current environment. This can be run after an + update to clear out any old, unusable entries from the solver cache. This + command can also be run directly from the command line through the + `--clean-mismatched-versions-solver-cache` command-line option. + +- `print_solver_cache` prints to the console all entries in the cache whose + SHA256 hash keys start with the given hex string. Providing an empty string + results in all entries in the cache being printed. + +- `print_solver_cache_stats` prints to the console statistics including the + size of the solver cache, where on disk it is stored, and some counts of how + often it has been used during the current session. + +For performing more complicated database operations on the set of cached +results, the file `solver_cache.py` is provided with the Python bindings of the +SAW Remote API. This file implements a general-purpose Python interface for +interacting with the LMDB databases kept by SAW for solver caching. + +Below is an example of using solver caching with `saw -v Debug`. Only the +relevant output is shown, the rest abbreviated with "...". + +:::{code-block} console +sawscript> set_solver_cache_path "example.cache" +sawscript> prove_print z3 {{ \(x:[8]) -> x+x == x*2 }} +[22:13:00.832] Caching result: d1f5a76e7a0b7c01 (SBV 9.2, Z3 4.8.7 - 64 bit) +... +sawscript> prove_print z3 {{ \(new:[8]) -> new+new == new*2 }} +[22:13:04.122] Using cached result: d1f5a76e7a0b7c01 (SBV 9.2, Z3 4.8.7 - 64 bit) +... +sawscript> prove_print (w4_unint_z3_using "qfnia" []) \ + {{ \(x:[8]) -> x+x == x*2 }} +[22:13:09.484] Caching result: 4ee451f8429c2dfe (What4 v1.3-29-g6c462cd using qfnia, Z3 4.8.7 - 64 bit) +... +sawscript> print_solver_cache "d1f5a76e7a0b7c01" +[22:13:13.250] SHA: d1f5a76e7a0b7c01bdfe7d0e1be82b4f233a805ae85a287d45933ed12a54d3eb +[22:13:13.250] - Result: unsat +[22:13:13.250] - Solver: "SBV->Z3" +[22:13:13.250] - Versions: SBV 9.2, Z3 4.8.7 - 64 bit +[22:13:13.250] - Last used: 2023-07-25 22:13:04.120351 UTC + +sawscript> print_solver_cache "4ee451f8429c2dfe" +[22:13:16.727] SHA: 4ee451f8429c2dfefecb6216162bd33cf053f8e66a3b41833193529449ef5752 +[22:13:16.727] - Result: unsat +[22:13:16.727] - Solver: "W4 ->z3" +[22:13:16.727] - Versions: What4 v1.3-29-g6c462cd using qfnia, Z3 4.8.7 - 64 bit +[22:13:16.727] - Last used: 2023-07-25 22:13:09.484464 UTC + +sawscript> print_solver_cache_stats +[22:13:20.585] == Solver result cache statistics == +[22:13:20.585] - 2 results cached in example.cache +[22:13:20.585] - 2 insertions into the cache so far this run (0 failed attempts) +[22:13:20.585] - 1 usage of cached results so far this run (0 failed attempts) +::: + +## Other External Provers + +In addition to the built-in automated provers already discussed, SAW +supports more generic interfaces to other arbitrary theorem provers +supporting specific interfaces. + +- `external_aig_solver : String -> [String] -> ProofScript ()` +supports theorem provers that can take input as a single-output AIGER +file. The first argument is the name of the executable to run. The +second argument is the list of command-line parameters to pass to that +executable. Any element of this list equal to `"%f"` will be replaced +with the name of the temporary AIGER file generated for the proof goal. +The output from the solver is expected to be in DIMACS solution format. + +- `external_cnf_solver : String -> [String] -> ProofScript ()` +works similarly but for SAT solvers that take input in DIMACS CNF format +and produce output in DIMACS solution format. + +## Offline Provers + +For provers that must be invoked in more complex ways, or to defer proof +until a later time, there are functions to write the current goal to a +file in various formats, and then assume that the goal is valid through +the rest of the script. + +- `offline_aig : String -> ProofScript ()` + +- `offline_cnf : String -> ProofScript ()` + +- `offline_extcore : String -> ProofScript ()` + +- `offline_smtlib2 : String -> ProofScript ()` + +- `offline_unint_smtlib2 : [String] -> String -> ProofScript ()` + +These support the AIGER, DIMACS CNF, shared SAWCore, and SMT-Lib v2 +formats, respectively. The shared representation for SAWCore is +described [in the `saw-script` +repository](https://github.com/GaloisInc/saw-script/blob/master/doc/extcore.md). +The `offline_unint_smtlib2` command represents the folded subterms +listed in its first argument as uninterpreted functions. + +(finishing-proofs-without-external-solvers)= +## Finishing Proofs without External Solvers + +Some proofs can be completed using unsound placeholders, or using +techniques that do not require significant computation. + +- `assume_unsat : ProofScript ()` indicates that the current goal +should be assumed to be unsatisfiable. This is an alias for +`assume_valid`. Users should prefer to use `admit` instead. + +- `assume_valid : ProofScript ()` indicates that the current +goal should be assumed to be valid. Users should prefer to +use `admit` instead + +- `admit : String -> ProofScript ()` indicates that the current +goal should be assumed to be valid without proof. The given +string should be used to record why the user has decided to +assume this proof goal. + +- `quickcheck : Int -> ProofScript ()` runs the goal on the given +number of random inputs, and succeeds if the result of evaluation is +always `True`. This is unsound, but can be helpful during proof +development, or as a way to provide some evidence for the validity of a +specification believed to be true but difficult or infeasible to prove. + +- `trivial : ProofScript ()` states that the current goal should +be trivially true. This tactic recognizes instances of equality +that can be demonstrated by conversion alone. In particular +it is able to prove `EqTrue x` goals where `x` reduces to +the constant value `True`. It fails if this is not the case. + +## Multiple Goals + +The proof scripts shown so far all have a single implicit goal. As in +many other interactive provers, however, SAWScript proofs can have +multiple goals. The following commands can introduce or work with +multiple goals. These are experimental and can be used only after +`enable_experimental` has been called. + +- `goal_apply : Theorem -> ProofScript ()` will apply a given +introduction rule to the current goal. This will result in zero or more +new subgoals. + +- `goal_assume : ProofScript Theorem` will convert the first hypothesis +in the current proof goal into a local `Theorem` + +- `goal_insert : Theorem -> ProofScript ()` will insert a given +`Theorem` as a new hypothesis in the current proof goal. + +- `goal_intro : String -> ProofScript Term` will introduce a quantified +variable in the current proof goal, returning the variable as a `Term`. + +- `goal_when : String -> ProofScript () -> ProofScript ()` will run the +given proof script only when the goal name contains the given string. + +- `goal_exact : Term -> ProofScript ()` will attempt to use the given +term as an exact proof for the current goal. This tactic will succeed +whever the type of the given term exactly matches the current goal, +and will fail otherwise. + +- `split_goal : ProofScript ()` will split a goal of the form +`Prelude.and prop1 prop2` into two separate goals `prop1` and `prop2`. + +## Proof Failure and Satisfying Assignments + +The `prove_print` and `sat_print` commands print out their essential +results (potentially returning a `Theorem` in the case of +`prove_print`). In some cases, though, one may want to act +programmatically on the result of a proof rather than displaying it. + +The `prove` and `sat` commands allow this sort of programmatic analysis +of proof results. To allow this, they use two types we haven't mentioned +yet: `ProofResult` and `SatResult`. These are different from the other +types in SAWScript because they encode the possibility of two outcomes. +In the case of `ProofResult`, a statement may be valid or there may be a +counter-example. In the case of `SatResult`, there may be a satisfying +assignment or the statement may be unsatisfiable. + +- `prove : ProofScript SatResult -> Term -> TopLevel ProofResult` + +- `sat : ProofScript SatResult -> Term -> TopLevel SatResult` + +To operate on these new types, SAWScript includes a pair of functions: + +- `caseProofResult : {b} ProofResult -> b -> (Term -> b) -> b` takes a +`ProofResult`, a value to return in the case that the statement is +valid, and a function to run on the counter-example, if there is one. + +- `caseSatResult : {b} SatResult -> b -> (Term -> b) -> b` has the same +shape: it returns its first argument if the result represents an +unsatisfiable statement, or its second argument applied to a satisfying +assignment if it finds one. + +## AIG Values and Proofs + +Most SAWScript programs operate on `Term` values, and in most cases this +is the appropriate representation. It is possible, however, to represent +the same function that a `Term` may represent using a different data +structure: an And-Inverter-Graph (AIG). An AIG is a representation of a +Boolean function as a circuit composed entirely of AND gates and +inverters. Hardware synthesis and verification tools, including the ABC +tool that SAW has built in, can do efficient verification and +particularly equivalence checking on AIGs. + +To take advantage of this capability, a handful of built-in commands can +operate on AIGs. + +- `bitblast : Term -> TopLevel AIG` represents a `Term` as an `AIG` by +"blasting" all of its primitive operations (things like bit-vector +addition) down to the level of individual bits. + +- `load_aig : String -> TopLevel AIG` loads an `AIG` from an external +AIGER file. + +- `save_aig : String -> AIG -> TopLevel ()` saves an `AIG` to an +external AIGER file. + +- `save_aig_as_cnf : String -> AIG -> TopLevel ()` writes an `AIG` out +in CNF format for input into a standard SAT solver. diff --git a/doc/saw-user-manual/specification-based-verification.md b/doc/saw-user-manual/specification-based-verification.md new file mode 100644 index 0000000000..0f029c25a7 --- /dev/null +++ b/doc/saw-user-manual/specification-based-verification.md @@ -0,0 +1,2127 @@ +# Specification-Based Verification + +The built-in functions described so far work by extracting models of +code that can then be used for a variety of purposes, including proofs +about the properties of the code. + +When the goal is to prove equivalence between some LLVM, Java, or MIR code and +a specification, however, a more declarative approach is sometimes +convenient. The following sections describe an approach that combines +model extraction and verification with respect to a specification. A +verified specification can then be used as input to future +verifications, allowing the proof process to be decomposed. + +## Running a Verification + +Verification of LLVM is controlled by the `llvm_verify` command. + +:::{code-block} sawscript +llvm_verify : + LLVMModule -> + String -> + [CrucibleMethodSpec] -> + Bool -> + LLVMSetup () -> + ProofScript SatResult -> + TopLevel CrucibleMethodSpec +::: + +The first two arguments specify the module and function name to verify, +as with `llvm_verify`. The third argument specifies the list of +already-verified specifications to use for compositional verification +(described later; use `[]` for now). The fourth argument specifies +whether to do path satisfiability checking, and the fifth gives the +specification of the function to be verified. Finally, the last argument +gives the proof script to use for verification. The result is a proved +specification that can be used to simplify verification of functions +that call this one. + +Similar commands are available for JVM programs: + +:::{code-block} sawscript +jvm_verify : + JavaClass -> + String -> + [JVMMethodSpec] -> + Bool -> + JVMSetup () -> + ProofScript SatResult -> + TopLevel JVMMethodSpec +::: + +And for MIR programs: + +:::{code-block} sawscript +mir_verify : + MIRModule -> + String -> + [MIRSpec] -> + Bool -> + MIRSetup () -> + ProofScript () -> + TopLevel MIRSpec +::: + +### Running a MIR-based verification + +(Note: API functions involving MIR verification require `enable_experimental` +in order to be used. As such, some parts of this API may change before being +finalized.) + +The `String` supplied as an argument to `mir_verify` is expected to be a +function _identifier_. An identifier is expected adhere to one of the following +conventions: + +- `/::` +- `::` + +Where: + +- `` is the name of the crate in which the function is defined. (If + you produced your MIR JSON file by compiling a single `.rs` file with + `saw-rustc`, then the crate name is the same as the name of the file, but + without the `.rs` file extension.) +- `` is a hash of the crate and its dependencies. In extreme + cases, it is possible for two different crates to have identical crate names, + in which case the disambiguator must be used to distinguish between the two + crates. In the common case, however, most crate names will correspond to + exactly one disambiguator, and you are allowed to leave out the + `/` part of the `String` in this case. If you supply an + identifier with an ambiguous crate name and omit the disambiguator, then SAW + will raise an error. +- `` is the path to the function within the crate. Sometimes, + this is as simple as the function name itself. In other cases, a function + path may involve multiple _segments_, depending on the module hierarchy for + the program being verified. For instance, a `read` function located in + `core/src/ptr/mod.rs` will have the identifier: + + :::{code-block} text + core::ptr::read + ::: + + Where `core` is the crate name and `ptr::read` is the function path, which + has two segments `ptr` and `read`. There are also some special forms of + segments that appear for functions defined in certain language constructs. + For instance, if a function is defined in an `impl` block, then it will have + `{impl}` as one of its segments, e.g., + + :::{code-block} text + core::ptr::const_ptr::{impl}::offset + ::: + + If you are in doubt about what the full identifier for a given function is, + consult the MIR JSON file for your program. + +----- + +Now we describe how to construct a value of type `LLVMSetup ()`, `JVMSetup ()`, +or `MIRSetup ()`. + +## Structure of a Specification + +A specifications for Crucible consists of three logical components: + +- A specification of the initial state before execution of the function. + +- A description of how to call the function within that state. + +- A specification of the expected final value of the program state. + +These three portions of the specification are written in sequence within a `do` +block of type `{LLVM,JVM,MIR}Setup`. The command `{llvm,jvm,mir}_execute_func` +separates the specification of the initial state from the specification of the +final state, and specifies the arguments to the function in terms of the +initial state. Most of the commands available for state description will work +either before or after `{llvm,jvm,mir}_execute_func`, though with slightly +different meaning, as described below. + +## Creating Fresh Variables + +In any case where you want to prove a property of a function for an entire +class of inputs (perhaps all inputs) rather than concrete values, the initial +values of at least some elements of the program state must contain fresh +variables. These are created in a specification with the +`{llvm,jvm,mir}_fresh_var` commands rather than `fresh_symbolic`. + +- `llvm_fresh_var : String -> LLVMType -> LLVMSetup Term` + +- `jvm_fresh_var : String -> JavaType -> JVMSetup Term` + +- `mir_fresh_var : String -> MIRType -> MIRSetup Term` + +The first parameter to both functions is a name, used only for +presentation. It's possible (though not recommended) to create multiple +variables with the same name, but SAW will distinguish between them +internally. The second parameter is the LLVM, Java, or MIR type of the +variable. The resulting `Term` can be used in various subsequent +commands. + +Note that the second parameter to `{llvm,jvm,mir}_fresh_var` must be a type +that has a counterpart in Cryptol. (For more information on this, refer to the +"Cryptol type correspondence" section.) If the type does not have a Cryptol +counterpart, the function will raise an error. If you do need to create a fresh +value of a type that cannot be represented in Cryptol, consider using a +function such as `llvm_fresh_expanded_val` (for LLVM verification) or +`mir_fresh_expanded_value` (for MIR verification). + +LLVM types are built with this set of functions: + +- `llvm_int : Int -> LLVMType` +- `llvm_alias : String -> LLVMType` +- `llvm_array : Int -> LLVMType -> LLVMType` +- `llvm_float : LLVMType` +- `llvm_double : LLVMType` +- `llvm_packed_struct : [LLVMType] -> LLVMType` +- `llvm_struct_type : [LLVMType] -> LLVMType` + +Java types are built up using the following functions: + +- `java_bool : JavaType` +- `java_byte : JavaType` +- `java_char : JavaType` +- `java_short : JavaType` +- `java_int : JavaType` +- `java_long : JavaType` +- `java_float : JavaType` +- `java_double : JavaType` +- `java_class : String -> JavaType` +- `java_array : Int -> JavaType -> JavaType` + +MIR types are built up using the following functions: + +- `mir_adt : MIRAdt -> MIRType` +- `mir_array : Int -> MIRType -> MIRType` +- `mir_bool : MIRType` +- `mir_char : MIRType` +- `mir_i8 : MIRType` +- `mir_i6 : MIRType` +- `mir_i32 : MIRType` +- `mir_i64 : MIRType` +- `mir_i128 : MIRType` +- `mir_isize : MIRType` +- `mir_f32 : MIRType` +- `mir_f64 : MIRType` +- `mir_lifetime : MIRType` +- `mir_ref : MIRType -> MIRType` +- `mir_ref_mut : MIRType -> MIRType` +- `mir_slice : MIRType -> MIRType` +- `mir_str : MIRType` +- `mir_tuple : [MIRType] -> MIRType` +- `mir_u8 : MIRType` +- `mir_u6 : MIRType` +- `mir_u32 : MIRType` +- `mir_u64 : MIRType` +- `mir_u128 : MIRType` +- `mir_usize : MIRType` + +Most of these types are straightforward mappings to the standard LLVM +and Java types. The one key difference is that arrays must have a fixed, +concrete size. Therefore, all analysis results are valid only under the +assumption that any arrays have the specific size indicated, and may not +hold for other sizes. + +The `llvm_int` function takes an `Int` parameter indicating the variable's bit +width. For example, the C `uint16_t` and `int16_t` types correspond to +`llvm_int 16`. The C `bool` type is slightly trickier. A bare `bool` type +typically corresponds to `llvm_int 1`, but if a `bool` is a member of a +composite type such as a pointer, array, or struct, then it corresponds to +`llvm_int 8`. This is due to a peculiarity in the way Clang compiles `bool` +down to LLVM. When in doubt about how a `bool` is represented, check the LLVM +bitcode by compiling your code with `clang -S -emit-llvm`. + +LLVM types can also be specified in LLVM syntax directly by using the +`llvm_type` function. + +- `llvm_type : String -> LLVMType` + +For example, `llvm_type "i32"` yields the same result as `llvm_int 32`. + +The most common use for creating fresh variables is to state that a +particular function should have the specified behaviour for arbitrary +initial values of the variables in question. Sometimes, however, it can +be useful to specify that a function returns (or stores, more about this +later) an arbitrary value, without specifying what that value should be. +To express such a pattern, you can also run `llvm_fresh_var` from +the post state (i.e., after `llvm_execute_func`). + +## The SetupValue and JVMValue Types + +Many specifications require reasoning about both pure values and about +the configuration of the heap. The `SetupValue` type corresponds to +values that can occur during symbolic execution, which includes both +`Term` values, pointers, and composite types consisting of either of +these (both structures and arrays). + +The `llvm_term`, `jvm_term`, and `mir_term` functions create a `SetupValue`, +`JVMValue`, or `MIRValue`, respectively, from a `Term`: + +- `llvm_term : Term -> SetupValue` +- `jvm_term : Term -> JVMValue` +- `mir_term : Term -> MIRValue` + +The value that these functions return will have an LLVM, JVM, or MIR type +corresponding to the Cryptol type of the `Term` argument. (For more information +on this, refer to the "Cryptol type correspondence" section.) If the type does +not have a Cryptol counterpart, the function will raise an error. + +### Cryptol type correspondence + +The `{llvm,jvm,mir}_fresh_var` functions take an LLVM, JVM, or MIR type as an +argument and produces a `Term` variable of the corresponding Cryptol type as +output. Similarly, the `{llvm,jvm,mir}_term` functions take a Cryptol `Term` as +input and produce a value of the corresponding LLVM, JVM, or MIR type as +output. This section describes precisely which types can be converted to +Cryptol types (and vice versa) in this way. + +#### LLVM verification + +The following LLVM types correspond to Cryptol types: + +- `llvm_alias `: Corresponds to the same Cryptol type as the type used + in the definition of ``. +- `llvm_array `: Corresponds to the Cryptol sequence `[][]`, + where `` is the Cryptol type corresponding to ``. +- `llvm_int `: Corresponds to the Cryptol word `[]`. +- `llvm_struct_type [, ..., ]` and `llvm_packed_struct [, ..., ]`: + Corresponds to the Cryptol tuple `(, ..., )`, where `` + is the Cryptol type corresponding to `` for each `i` ranging from `1` + to `n`. + +The following LLVM types do _not_ correspond to Cryptol types: + +- `llvm_double` +- `llvm_float` +- `llvm_pointer` + +#### JVM verification + +The following Java types correspond to Cryptol types: + +- `java_array `: Corresponds to the Cryptol sequence `[][]`, + where `` is the Cryptol type corresponding to ``. +- `java_bool`: Corresponds to the Cryptol `Bit` type. +- `java_byte`: Corresponds to the Cryptol `[8]` type. +- `java_char`: Corresponds to the Cryptol `[16]` type. +- `java_int`: Corresponds to the Cryptol `[32]` type. +- `java_long`: Corresponds to the Cryptol `[64]` type. +- `java_short`: Corresponds to the Cryptol `[16]` type. + +The following Java types do _not_ correspond to Cryptol types: + +- `java_class` +- `java_double` +- `java_float` + +#### MIR verification + +The following MIR types correspond to Cryptol types: + +- `mir_array `: Corresponds to the Cryptol sequence `[][]`, + where `` is the Cryptol type corresponding to ``. +- `mir_bool`: Corresponds to the Cryptol `Bit` type. +- `mir_char`: Corresponds to the Cryptol `[32]` type. +- `mir_i8` and `mir_u8`: Corresponds to the Cryptol `[8]` type. +- `mir_i16` and `mir_u16`: Corresponds to the Cryptol `[16]` type. +- `mir_i32` and `mir_u32`: Corresponds to the Cryptol `[32]` type. +- `mir_i64` and `mir_u64`: Corresponds to the Cryptol `[64]` type. +- `mir_i128` and `mir_u128`: Corresponds to the Cryptol `[128]` type. +- `mir_isize` and `mir_usize`: Corresponds to the Cryptol `[32]` type. +- `mir_tuple [, ..., ]`: Corresponds to the Cryptol tuple + `(, ..., )`, where `` is the Cryptol type corresponding + to `` for each `i` ranging from `1` to `n`. + +The following MIR types do _not_ correspond to Cryptol types: + +- `mir_adt` +- `mir_f32` +- `mir_f64` +- `mir_ref` and `mir_ref_mut` +- `mir_slice` +- `mir_str` + +## Executing + +Once the initial state has been configured, the `{llvm,jvm,mir}_execute_func` +command specifies the parameters of the function being analyzed in terms +of the state elements already configured. + +- `llvm_execute_func : [SetupValue] -> LLVMSetup ()` +- `jvm_execute_func : [JVMValue] -> JVMSetup ()` +- `mir_execute_func : [MIRValue] -> MIRSetup ()` + +## Return Values + +To specify the value that should be returned by the function being +verified use the `{llvm,jvm,mir}_return` command. + +- `llvm_return : SetupValue -> LLVMSetup ()` +- `jvm_return : JVMValue -> JVMSetup ()` +- `mir_return : MIRValue -> MIRSetup ()` + +## A First Simple Example + +The commands introuduced so far are sufficient to verify simple programs +that do not use pointers (or that use them only internally). Consider, +for instance the C program that adds its two arguments together: + +:::{code-block} c +#include +uint32_t add(uint32_t x, uint32_t y) { + return x + y; +} +::: + +We can specify this function's expected behavior as follows: + +:::{code-block} sawscript +let add_setup = do { + x <- llvm_fresh_var "x" (llvm_int 32); + y <- llvm_fresh_var "y" (llvm_int 32); + llvm_execute_func [llvm_term x, llvm_term y]; + llvm_return (llvm_term {{ x + y : [32] }}); +}; +::: + +We can then compile the C file `add.c` into the bitcode file `add.bc` +and verify it with ABC: + +:::{code-block} sawscript +m <- llvm_load_module "add.bc"; +add_ms <- llvm_verify m "add" [] false add_setup abc; +::: + +(compositional-verification)= +## Compositional Verification + +The primary advantage of the specification-based approach to +verification is that it allows for compositional reasoning. That is, +when proving properties of a given method or function, we can make use +of properties we have already proved about its callees rather than +analyzing them anew. This enables us to reason about much larger +and more complex systems than otherwise possible. + +The `llvm_verify`, `jvm_verify`, and `mir_verify` functions return values of +type `CrucibleMethodSpec`, `JVMMethodSpec`, and `MIRMethodSpec`, respectively. +These values are opaque objects that internally contain both the information +provided in the associated `LLVMSetup`, `JVMSetup`, or `MIRSetup` blocks, +respectively, and the results of the verification process. + +Any of these `MethodSpec` objects can be passed in via the third +argument of the `..._verify` functions. For any function or method +specified by one of these parameters, the simulator will not follow +calls to the associated target. Instead, it will perform the following +steps: + +- Check that all `llvm_points_to` and `llvm_precond` statements + (or the corresponding JVM or MIR statements) in the specification are + satisfied. + +- Update the simulator state and optionally construct a return value as + described in the specification. + +More concretely, building on the previous example, say we have a +doubling function written in terms of `add`: + +:::{code-block} c +uint32_t dbl(uint32_t x) { + return add(x, x); +} +::: + +It has a similar specification to `add`: + +:::{code-block} sawscript +let dbl_setup = do { + x <- llvm_fresh_var "x" (llvm_int 32); + llvm_execute_func [llvm_term x]; + llvm_return (llvm_term {{ x + x : [32] }}); +}; +::: + +And we can verify it using what we've already proved about `add`: + +:::{code-block} sawscript +llvm_verify m "dbl" [add_ms] false dbl_setup abc; +::: + +In this case, doing the verification compositionally doesn't save +computational effort, since the functions are so simple, but it +illustrates the approach. + +### Compositional Verification and Mutable Allocations + +A common pitfall when using compositional verification is to reuse a +specification that underspecifies the value of a mutable allocation. In +general, doing so can lead to unsound verification, so SAW goes through great +lengths to check for this. + +Here is an example of this pitfall in an LLVM verification. Given this C code: + +::: c +void side_effect(uint32_t *a) { + *a = 0; +} + +uint32_t foo(uint32_t x) { + uint32_t b = x; + side_effect(&b); + return b; +} +::: + +And the following SAW specifications: + +:::{code-block} sawscript +let side_effect_spec = do { + a_ptr <- llvm_alloc (llvm_int 32); + a_val <- llvm_fresh_var "a_val" (llvm_int 32); + llvm_points_to a_ptr (llvm_term a_val); + + llvm_execute_func [a_ptr]; +}; + +let foo_spec = do { + x <- llvm_fresh_var "x" (llvm_int 32); + + llvm_execute_func [llvm_term x]; + + llvm_return (llvm_term x); +}; +::: + +Should SAW be able to verify the `foo` function against `foo_spec` using +compositional verification? That is, should the following be expected to work? + +:::{code-block} sawscript +side_effect_ov <- llvm_verify m "side_effect" [] false side_effect_spec z3; +llvm_verify m "foo" [side_effect_ov] false foo_spec z3; +::: + +A literal reading of `side_effect_spec` would suggest that the `side_effect` +function allocates `a_ptr` but then does nothing with it, implying that `foo` +returns its argument unchanged. This is incorrect, however, as the +`side_effect` function actually changes its argument to point to `0`, so the +`foo` function ought to return `0` as a result. SAW should not verify `foo` +against `foo_spec`, and indeed it does not. + +The problem is that `side_effect_spec` underspecifies the value of `a_ptr` in +its postconditions, which can lead to the potential unsoundness seen above when +`side_effect_spec` is used in compositional verification. To prevent this +source of unsoundness, SAW will _invalidate_ the underlying memory of any +mutable pointers (i.e., those declared with `llvm_alloc`, not +`llvm_alloc_global`) allocated in the preconditions of compositional override +that do not have a corresponding `llvm_points_to` statement in the +postconditions. Attempting to read from invalidated memory constitutes an +error, as can be seen in this portion of the error message when attempting to +verify `foo` against `foo_spec`: + +:::{code-block} console +invalidate (state of memory allocated in precondition (at side.saw:3:12) not described in postcondition) +::: + +To fix this particular issue, add an `llvm_points_to` statement to +`side_effect_spec`: + +:::{code-block} sawscript +let side_effect_spec = do { + a_ptr <- llvm_alloc (llvm_int 32); + a_val <- llvm_fresh_var "a_val" (llvm_int 32); + llvm_points_to a_ptr (llvm_term a_val); + + llvm_execute_func [a_ptr]; + + // This is new + llvm_points_to a_ptr (llvm_term {{ 0 : [32] }}); +}; +::: + +After making this change, SAW will reject `foo_spec` for a different reason, as +it claims that `foo` returns its argument unchanged when it actually returns +`0`. + +Note that invalidating memory itself does not constitute an error, so if the +`foo` function never read the value of `b` after calling `side_effect(&b)`, +then there would be no issue. It is only when a function attempts to _read_ +from invalidated memory that an error is thrown. In general, it can be +difficult to predict when a function will or will not read from invalidated +memory, however. For this reason, it is recommended to always specify the +values of mutable allocations in the postconditions of your specs, as it can +avoid pitfalls like the one above. + +The same pitfalls apply to compositional MIR verification, with a couple of key +differences. In MIR verification, mutable references are allocated using +`mir_alloc_mut`. Here is a Rust version of the pitfall program above: + +:::{code-block} rust +pub fn side_effect(a: &mut u32) { + *a = 0; +} + +pub fn foo(x: u32) -> u32 { + let mut b: u32 = x; + side_effect(&mut b); + b +} +::: + +:::{code-block} sawscript +let side_effect_spec = do { + a_ref <- mir_alloc_mut mir_u32; + a_val <- mir_fresh_var "a_val" mir_u32; + mir_points_to a_ref (mir_term a_val); + + mir_execute_func [a_ref]; +}; + +let foo_spec = do { + x <- mir_fresh_var "x" mir_u32; + + mir_execute_func [mir_term x]; + + mir_return (mir_term {{ x }}); +}; +::: + +Just like above, if you attempted to prove `foo` against `foo_spec` using +compositional verification: + +:::{code-block} sawscript +side_effect_ov <- mir_verify m "test::side_effect" [] false side_effect_spec z3; +mir_verify m "test::foo" [side_effect_ov] false foo_spec z3; +::: + +Then SAW would throw an error, as `side_effect_spec` underspecifies the value +of `a_ref` in its postconditions. `side_effect_spec` can similarly be repaired +by adding a `mir_points_to` statement involving `a_ref` in `side_effect_spec`'s +postconditions. + +MIR verification differs slightly from LLVM verification in how it catches +underspecified mutable allocations when using compositional overrides. The LLVM +memory model achieves this by invalidating the underlying memory in +underspecified allocations. The MIR memory model, on the other hand, does not +have a direct counterpart to memory invalidation. As a result, any MIR overrides +must specify the values of all mutable allocations in their postconditions, +_even if the function that calls the override never uses the allocations_. + +To illustrate this point more finely, suppose that the `foo` function had +instead been defined like this: + +:::{code-block} rust +pub fn foo(x: u32) -> u32 { + let mut b: u32 = x; + side_effect(&mut b); + 42 +} +::: + +Here, it does not particularly matter what effects the `side_effect` function +has on its argument, as `foo` will now return `42` regardless. Still, if you +attempt to prove `foo` by using `side_effect` as a compositional override, then +it is strictly required that you specify the value of `side_effect`'s argument +in its postconditions, even though the answer that `foo` returns is unaffected +by this. This is in contrast with LLVM verification, where one could get away +without specifying `side_effect`'s argument in this example, as the invalidated +memory in `b` would never be read. + +(compositional-verification-and-mutable-global-variables)= +### Compositional Verification and Mutable Global Variables + +Just like with local mutable allocations (see the previous section), +specifications used in compositional overrides must specify the values of +mutable global variables in their postconditions. To illustrate this using LLVM +verification, here is a variant of the C program from the previous example that +uses a mutable global variable `a`: + +:::{code-block} c +uint32_t a = 42; + +void side_effect(void) { + a = 0; +} + +uint32_t foo(void) { + side_effect(); + return a; +} +::: + +If we attempted to verify `foo` against this `foo_spec` specification using +compositional verification: + +:::{code-block} sawscript +let side_effect_spec = do { + llvm_alloc_global "a"; + llvm_points_to (llvm_global "a") (llvm_global_initializer "a"); + + llvm_execute_func []; +}; + +let foo_spec = do { + llvm_alloc_global "a"; + llvm_points_to (llvm_global "a") (llvm_global_initializer "a"); + + llvm_execute_func []; + + llvm_return (llvm_global_initializer "a"); +}; + +side_effect_ov <- llvm_verify m "side_effect" [] false side_effect_spec z3; +llvm_verify m "foo" [side_effect_ov] false foo_spec z3; +::: + +Then SAW would reject it, as `side_effect_spec` does not specify what `a`'s +value should be in its postconditions. Just as with local mutable allocations, +SAW will invalidate the underlying memory in `a`, and subsequently reading from +`a` in the `foo` function will throw an error. The solution is to add an +`llvm_points_to` statement in the postconditions that declares that `a`'s value +is set to `0`. + +The same concerns apply to MIR verification, where mutable global variables are +referred to as `static mut` items. (See the [MIR static +items](#mir-static-items) section for more information). Here is a Rust version +of the program above: + +:::{code-block} rust +static mut A: u32 = 42; + +pub fn side_effect() { + unsafe { + A = 0; + } +} + +pub fn foo() -> u32 { + side_effect(); + unsafe { A } +} +::: + +:::{code-block} sawscript +let side_effect_spec = do { + mir_points_to (mir_static "test::A") (mir_static_initializer "test::A"); + + mir_execute_func []; +}; + +let foo_spec = do { + mir_points_to (mir_static "test::A") (mir_static_initializer "test::A"); + + mir_execute_func []; + + mir_return (mir_static_initializer "test::A"); +}; + +side_effect_ov <- mir_verify m "side_effect" [] false side_effect_spec z3; +mir_verify m "foo" [side_effect_ov] false foo_spec z3; +::: + +Just as above, we can repair this by adding a `mir_points_to` statement in +`side_effect_spec`'s postconditions that specifies that `A` is set to `0`. + +Recall from the previous section that MIR verification is stricter than LLVM +verification when it comes to specifying mutable allocations in the +postconditions of compositional overrides. This is especially true for mutable +static items. In MIR verification, any compositional overrides must specify the +values of all mutable static items in the entire program in their +postconditions, _even if the function that calls the override never uses the +static items_. For example, if the `foo` function were instead defined like +this: + +:::{code-block} rust +pub fn foo() -> u32 { + side_effect(); + 42 +} +::: + +Then it is still required for `side_effect_spec` to specify what `A`'s value +will be in its postconditions, despite the fact that this has no effect on the +value that `foo` will return. + +## Specifying Heap Layout + +Most functions that operate on pointers expect that certain pointers +point to allocated memory before they are called. The `llvm_alloc` +command allows you to specify that a function expects a particular +pointer to refer to an allocated region appropriate for a specific type. + +- `llvm_alloc : LLVMType -> LLVMSetup SetupValue` + +This command returns a `SetupValue` consisting of a pointer to the +allocated space, which can be used wherever a pointer-valued +`SetupValue` can be used. + +In the initial state, `llvm_alloc` specifies that the function +expects a pointer to allocated space to exist. In the final state, it +specifies that the function itself performs an allocation. + +When using the experimental Java implementation, separate functions +exist for specifying that arrays or objects are allocated: + +- `jvm_alloc_array : Int -> JavaType -> JVMSetup JVMValue` specifies an +array of the given concrete size, with elements of the given type. + +- `jvm_alloc_object : String -> JVMSetup JVMValue` specifies an object +of the given class name. + +The experimental MIR implementation also has a `mir_alloc` function, which +behaves similarly to `llvm_alloc`. `mir_alloc` creates an immutable reference, +but there is also a `mir_alloc_mut` function for creating a mutable reference: + +- `mir_alloc : MIRType -> MIRSetup MIRValue` + +- `mir_alloc_mut : MIRType -> MIRSetup MIRValue` + +MIR tracks whether references are mutable or immutable at the type level, so it +is important to use the right allocation command for a given reference type. + +In LLVM, it's also possible to construct fresh pointers that do not +point to allocated memory (which can be useful for functions that +manipulate pointers but not the values they point to): + +- `llvm_fresh_pointer : LLVMType -> LLVMSetup SetupValue` + +The NULL pointer is called `llvm_null` in LLVM and `jvm_null` in +JVM: + +- `llvm_null : SetupValue` +- `jvm_null : JVMValue` + +One final, slightly more obscure command is the following: + +- `llvm_alloc_readonly : LLVMType -> LLVMSetup SetupValue` + +This works like `llvm_alloc` except that writes to the space +allocated are forbidden. This can be useful for specifying that a +function should take as an argument a pointer to allocated space that it +will not modify. Unlike `llvm_alloc`, regions allocated with +`llvm_alloc_readonly` are allowed to alias other read-only regions. + +## Specifying Heap Values + +Pointers returned by `llvm_alloc`, `jvm_alloc_{array,object}`, or +`mir_alloc{,_mut}` don't initially point to anything. So if you pass such a +pointer directly into a function that tried to dereference it, symbolic +execution will fail with a message about an invalid load. For some functions, +such as those that are intended to initialize data structures (writing to the +memory pointed to, but never reading from it), this sort of uninitialized +memory is appropriate. In most cases, however, it's more useful to state that a +pointer points to some specific (usually symbolic) value, which you can do with +the _points-to_ family of commands. + +### LLVM heap values + +LLVM verification primarily uses the `llvm_points_to` command: + +- `llvm_points_to : SetupValue -> SetupValue -> LLVMSetup ()` +takes two `SetupValue` arguments, the first of which must be a pointer, +and states that the memory specified by that pointer should contain the +value given in the second argument (which may be any type of +`SetupValue`). + +When used in the final state, `llvm_points_to` specifies that the +given pointer _should_ point to the given value when the function +finishes. + +Occasionally, because C programs frequently reinterpret memory of one +type as another through casts, it can be useful to specify that a +pointer points to a value that does not agree with its static type. + +- `llvm_points_to_untyped : SetupValue -> SetupValue -> +LLVMSetup ()` works like `llvm_points_to` but omits type +checking. Rather than omitting type checking across the board, we +introduced this additional function to make it clear when a type +reinterpretation is intentional. As an alternative, one +may instead use `llvm_cast_pointer` to line up the static types. + +### JVM heap values + +JVM verification has two categories of commands for specifying heap values. +One category consists of the `jvm_*_is` commands, which allow users to directly +specify what value a heap object points to. There are specific commands for +each type of JVM heap object: + +- `jvm_array_is : JVMValue -> Term -> JVMSetup ()` declares that an array (the + first argument) contains a sequence of values (the second argument). +- `jvm_elem_is : JVMValue -> Int -> JVMValue -> JVMSetup ()` declares that an + array (the first argument) has an element at the given index (the second + argument) containing the given value (the third argument). +- `jvm_field_is : JVMValue -> String -> JVMValue -> JVMSetup ()` declares that + an object (the first argument) has a field (the second argument) containing + the given value (the third argument). +- `jvm_static_field_is : String -> JVMValue -> JVMSetup ()` declares that a + named static field (the first argument) contains the given value (the second + argument). By default, the field name is assumed to belong to the same class + as the method being specified. Static fields belonging to other classes can + be selected using the `.` syntax in the first argument. + +Another category consists of the `jvm_modifies_*` commands. Like the `jvm_*_is` +commands, these specify that a JVM heap object points to valid memory, but +unlike the `jvm_*_is` commands, they leave the exact value being pointed to as +unspecified. These are useful for writing partial specifications for methods +that modify some heap value, but without saying anything specific about the new +value. + +- `jvm_modifies_array : JVMValue -> JVMSetup ()` +- `jvm_modifies_elem : JVMValue -> Int -> JVMSetup ()` +- `jvm_modifies_field : JVMValue -> String -> JVMSetup ()` +- `jvm_modifies_static_field : String -> JVMSetup ()` + +### MIR heap values + +MIR verification has a single `mir_points_to` command: + +- `mir_points_to : MIRValue -> MIRValue -> MIRSetup ()` +takes two `SetupValue` arguments, the first of which must be a reference, +and states that the memory specified by that reference should contain the +value given in the second argument (which may be any type of +`SetupValue`). + +## Working with Compound Types + +The commands mentioned so far give us no way to specify the values of +compound types (arrays or `struct`s). Compound values can be dealt with +either piecewise or in their entirety. + +- `llvm_elem : SetupValue -> Int -> SetupValue` yields a pointer to +an internal element of a compound value. For arrays, the `Int` parameter +is the array index. For `struct` values, it is the field index. + +- `llvm_field : SetupValue -> String -> SetupValue` yields a pointer +to a particular named `struct` field, if debugging information is +available in the bitcode. + +Either of these functions can be used with `llvm_points_to` to +specify the value of a particular array element or `struct` field. +Sometimes, however, it is more convenient to specify all array elements +or field values at once. The `llvm_array_value` and `llvm_struct_value` +functions construct compound values from lists of element values. + +- `llvm_array_value : [SetupValue] -> SetupValue` +- `llvm_struct_value : [SetupValue] -> SetupValue` + +To specify an array or struct in which each element or field is +symbolic, it would be possible, but tedious, to use a large combination +of `llvm_fresh_var` and `llvm_elem` or `llvm_field` commands. +However, the following function can simplify the common case +where you want every element or field to have a fresh value. + +- `llvm_fresh_expanded_val : LLVMType -> LLVMSetup SetupValue` + +The `llvm_struct_value` function normally creates a `struct` whose layout +obeys the alignment rules of the platform specified in the LLVM file +being analyzed. Structs in LLVM can explicitly be "packed", however, so +that every field immediately follows the previous in memory. The +following command will create values of such types: + +- `llvm_packed_struct_value : [SetupValue] -> SetupValue` + +C programs will sometimes make use of pointer casting to implement +various kinds of polymorphic behaviors, either via direct pointer +casts, or by using `union` types to codify the pattern. To reason +about such cases, the following operation is useful. + +- `llvm_cast_pointer : SetupValue -> LLVMType -> SetupValue` + +This function function casts the type of the input value (which must be a +pointer) so that it points to values of the given type. This mainly +affects the results of subsequent `llvm_field` and `llvm_elem` calls, +and any eventual `points_to` statements that the resulting pointer +flows into. This is especially useful for dealing with C `union` +types, as the type information provided by LLVM is imprecise in these +cases. + +We can automate the process of applying pointer casts if we have debug +information avaliable: + +- `llvm_union : SetupValue -> String -> SetupValue` + +Given a pointer setup value, this attempts to select the named union +branch and cast the type of the pointer. For this to work, debug +symbols must be included; moreover, the process of correlating LLVM +type information with information contained in debug symbols is a bit +heuristic. If `llvm_union` cannot figure out how to cast a pointer, +one can fall back on the more manual `llvm_cast_pointer` instead. + +In the experimental Java verification implementation, the following +functions can be used to state the equivalent of a combination of +`llvm_points_to` and either `llvm_elem` or `llvm_field`. + +- `jvm_elem_is : JVMValue -> Int -> JVMValue -> JVMSetup ()` specifies +the value of an array element. + +- `jvm_field_is : JVMValue -> String -> JVMValue -> JVMSetup ()` +specifies the name of an object field. + +In the experimental MIR verification implementation, the following functions +construct compound values: + +- `mir_array_value : MIRType -> [MIRValue] -> MIRValue` constructs an array + of the given type whose elements consist of the given values. Supplying the + element type is necessary to support length-0 arrays. +- `mir_enum_value : MIRAdt -> String -> [MIRValue] -> MIRValue` constructs an + enum using a particular enum variant. The `MIRAdt` arguments determines what + enum type to create, the `String` value determines the name of the variant to + use, and the `[MIRValue]` list are the values to use as elements in the + variant. + + See the "Finding MIR algebraic data types" section (as well as the "Enums" + subsection) for more information on how to compute a `MIRAdt` value to pass + to `mir_enum_value`. +- `mir_slice_value : MIRValue -> MIRValue`: see the "MIR slices" section below. +- `mir_slice_range_value : MIRValue -> Int -> Int -> MIRValue`: see the + "MIR slices" section below. +- `mir_str_slice_value : MIRValue -> MIRValue`: see the "MIR slices" section + below. +- `mir_str_slice_range_value : MIRValue -> Int -> Int -> MIRValue`: see the + "MIR slices" section below. +- `mir_struct_value : MIRAdt -> [MIRValue] -> MIRValue` construct a struct + with the given list of values as elements. The `MIRAdt` argument determines + what struct type to create. + + See the "Finding MIR algebraic data types" section for more information on how + to compute a `MIRAdt` value to pass to `mir_struct_value`. +- `mir_tuple_value : [MIRValue] -> MIRValue` construct a tuple with the given + list of values as elements. + +To specify a compound value in which each element or field is symbolic, it +would be possible, but tedious, to use a large number of `mir_fresh_var` +invocations in conjunction with the commands above. However, the following +function can simplify the common case where you want every element or field to +have a fresh value: + +- `mir_fresh_expanded_value : String -> MIRType -> MIRSetup MIRValue` + +The `String` argument denotes a prefix to use when generating the names of +fresh symbolic variables. The `MIRType` can be any type, with the exception of +reference types (or compound types that contain references as elements or +fields), which are not currently supported. + +### MIR slices + +Slices are a unique form of compound type that is currently only used during +MIR verification. Unlike other forms of compound values, such as arrays, it is +not possible to directly construct a slice. Instead, one must take a slice of +an existing reference value that points to the thing being sliced. + +SAW currently supports taking slices of arrays and strings. + +#### Array slices + +The following commands are used to construct slices of arrays: + +- `mir_slice_value : MIRValue -> MIRValue`: the SAWScript expression + `mir_slice_value base` is equivalent to the Rust expression `&base[..]`, + i.e., a slice of the entirety of `base`. `base` must be a reference to an + array value (`&[T; N]` or `&mut [T; N]`), not an array itself. The type of + `mir_slice_value base` will be `&[T]` (if `base` is an immutable reference) + or `&mut [T]` (if `base` is a mutable reference). +- `mir_slice_range_value : MIRValue -> Int -> Int -> MIRValue`: the SAWScript + expression `mir_slice_range_value base start end` is equivalent to the Rust + expression `&base[start..end]`, i.e., a slice over a part of `base` which + ranges from `start` to `end`. `base` must be a reference to an array value + (`&[T; N]` or `&mut [T; N]`), not an array itself. The type of + `mir_slice_value base` will be `&[T]` (if `base` is an immutable reference) + or `&mut [T]` (if `base` is a mutable reference). + + `start` and `end` are assumed to be zero-indexed. `start` must not exceed + `end`, and `end` must not exceed the length of the array that `base` points + to. + +As an example of how to use these functions, consider this Rust function, which +accepts an arbitrary slice as an argument: + +:::{code-block} rust +pub fn f(s: &[u32]) -> u32 { + s[0] + s[1] +} +::: + +We can write a specification that passes a slice to the array `[1, 2, 3, 4, 5]` +as an argument to `f`: + +:::{code-block} sawscript +let f_spec_1 = do { + a <- mir_alloc (mir_array 5 mir_u32); + mir_points_to a (mir_term {{ [1, 2, 3, 4, 5] : [5][32] }}); + + mir_execute_func [mir_slice_value a]; + + mir_return (mir_term {{ 3 : [32] }}); +}; +::: + +Alternatively, we can write a specification that passes a part of this array +over the range `[1..3]`, i.e., ranging from second element to the fourth. +Because this is a half-open range, the resulting slice has length 2: + +:::{code-block} sawscript +let f_spec_2 = do { + a <- mir_alloc (mir_array 5 mir_u32); + mir_points_to a (mir_term {{ [1, 2, 3, 4, 5] : [5][32] }}); + + mir_execute_func [mir_slice_range_value a 1 3]; + + mir_return (mir_term {{ 5 : [32] }}); +}; +::: + +Note that we are passing _references_ of arrays to `mir_slice_value` and +`mir_slice_range_value`. It would be an error to pass a bare array to these +functions, so the following specification would be invalid: + +::: +let f_fail_spec_ = do { + let arr = mir_term {{ [1, 2, 3, 4, 5] : [5][32] }}; + + mir_execute_func [mir_slice_value arr]; // BAD: `arr` is not a reference + + mir_return (mir_term {{ 3 : [32] }}); +}; +::: + +Note that The `mir_slice_range_value` function must accept bare `Int` arguments +to specify the lower and upper bounds of the range. A consequence of this +design is that it is not possible to create a slice with a symbolic length. If +this limitation prevents you from using SAW, please file an issue [on +GitHub](https://github.com/GaloisInc/saw-script/issues). + +#### String slices + +In addition to slices of arrays (i.e., of type `&[T]`), SAW also supports +slices of strings (i.e., of type `&str`) through the following commands: + +- `mir_str_slice_value : MIRValue -> MIRValue`: the SAWScript expression + `mir_str_slice_value base` is equivalent to the Rust expression `&base[..]`, + i.e., a slice of the entirety of `base`. `base` must be a reference to an + array of bytes (`&[u8; N]` or `&mut [u8; N]`), not an array itself. The type + of `mir_str_slice_value base` will be `&str` (if `base` is an immutable + reference) or `&mut str` (if `base` is a mutable reference). +- `mir_str_slice_range_value : MIRValue -> Int -> Int -> MIRValue`: the + SAWScript expression `mir_slice_range_value base start end` is equivalent to + the Rust expression `&base[start..end]`, i.e., a slice over a part of `base` + which ranges from `start` to `end`. `base` must be a reference to an array of + bytes (`&[u8; N]` or `&mut [u8; N]`), not an array itself. The type of + `mir_slice_value base` will be `&str` (if `base` is an immutable reference) + or `&mut str` (if `base` is a mutable reference). + + `start` and `end` are assumed to be zero-indexed. `start` must not exceed + `end`, and `end` must not exceed the length of the array that `base` points + to. + +One unusual requirement about `mir_str_slice_value` and +`mir_str_slice_range_value` is that they require the argument to be of type +`&[u8; N]`, i.e., a reference to an array of bytes. This is an artifact of the +way that strings are encoded in Cryptol. The following Cryptol expressions: + +- `"A"` +- `"123"` +- `"Hello World"` + +Have the following types: + +- `[1][8]` +- `[3][8]` +- `[11][8]` + +This is because Cryptol strings are syntactic shorthand for sequences of bytes. +The following Cryptol expressions are wholly equivalent: + +- `[0x41]` +- `[0x31, 0x32, 0x33]` +- `[0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64]` + +These represent the strings in the extended ASCII character encoding. The +Cryptol sequence type `[N][8]` is equivalent to the Rust type `[u8; N]`, so the +requirement to have something of type `&[u8; N]` as an argument reflects this +design choice. + +Note that `mir_str_slice_value ` is _not_ the same thing as +`mir_slice_value `, as the two commands represent different types +of Rust values. While both commands take a `` as an argument, +`mir_str_slice_value` will return a value of Rust type `&str` (or `&mut str`), +whereas `mir_slice_value` will return a value of Rust type `&[u8]` (or `&mut +[u8]`). These Rust types are checked when you pass these values as arguments to +Rust functions (using `mir_execute_func`) or when you return these values +(using `mir_return`), and it is an error to supply a `&str` value in a place +where a `&[u8]` value is expected (and vice versa). + +As an example of how to write specifications involving string slices, +consider this Rust function: + +:::{code-block} rust +pub fn my_len(s: &str) -> usize { + s.len() +} +::: + +We can use `mir_str_slice_value` to write a specification for `my_len` when it +is given the string `"hello"` as an argument: + +:::{code-block} sawscript +let my_len_spec = do { + s <- mir_alloc (mir_array 5 mir_u8); + mir_points_to s (mir_term {{ "hello" }}); + + mir_execute_func [mir_str_slice_value s]; + + mir_return (mir_term {{ 5 : [32] }}); +}; +::: + +Currently, Cryptol only supports characters that can be encoded in a single +byte. As a result, it is not currently possible to take slices of strings with +certain characters. For example, the string `"roșu"` cannot be used as a +Cryptol expression, as the character `'ș'` would require 10 bits to represent +instead of 8. The alternative is to use UTF-8 to encode such characters. For +instance, the UTF-8 encoding of the string `"roșu"` is `"ro\200\153u"`, where +`"\200\153"` is a sequence of two bytes that represents the `'ș'` character. + +SAW makes no attempt to ensure that string slices over a particular range +aligns with UTF-8 character boundaries. For example, the following Rust code +would panic: + +:::{code-block} rust + let rosu: &str = "roșu"; + let s: &str = &rosu[0..3]; + println!("{:?}", s); +::: + +:::{code-block} console +thread 'main' panicked at 'byte index 3 is not a char boundary; it is inside 'ș' (bytes 2..4) of `roșu`' +::: + +On the other hand, SAW will allow you define a slice of the form +`mir_str_slice_range r 0 3`, where `r` is a reference to `"ro\200\153u"`. It is +the responsibility of the SAW user to ensure that `mir_str_slice_range` indices +align with character boundaries. + +### Finding MIR algebraic data types + +We collectively refer to MIR `struct`s and `enum`s together as _algebraic data +types_, or ADTs for short. ADTs have identifiers to tell them apart, and a +single ADT declaration can give rise to multiple identifiers depending on how +the declaration is used. For example: + +:::{code-block} rust +pub struct S { + pub x: A, + pub y: B, +} + +pub fn f() -> S { + S { + x: 1, + y: 2, + } +} + +pub fn g() -> S { + S { + x: 3, + y: 4, + } +} +::: + +This program as a single `struct` declaration `S`, which is used in the +functions `f` and `g`. Note that `S`'s declaration is _polymorphic_, as it uses +type parameters, but the uses of `S` in `f` and `g` are _monomorphic_, as `S`'s +type parameters are fully instantiated. Each unique, monomorphic instantiation +of an ADT gives rise to its own identifier. In the example above, this might +mean that the following identifiers are created when this code is compiled with +`mir-json`: + +- `S` gives rise to `example/abcd123::S::_adt456` +- `S` gives rise to `example/abcd123::S::_adt789` + +The suffix `_adt` is autogenerated by `mir-json` and is typically +difficult for humans to guess. For this reason, we offer a command to look up +an ADT more easily: + +- `mir_find_adt : MIRModule -> String -> [MIRType] -> MIRAdt` consults the + given `MIRModule` to find an algebraic data type (`MIRAdt`). It uses the given + `String` as an identifier and the given MIRTypes as the types to instantiate + the type parameters of the ADT. If such a `MIRAdt` cannot be found in the + `MIRModule`, this will raise an error. + +Note that the `String` argument to `mir_find_adt` does not need to include the +`_adt` suffix, as `mir_find_adt` will discover this for you. The `String` +is expected to adhere to the identifier conventions described in the "Running a +MIR-based verification" section. For instance, the following two lines will +look up `S` and `S` from the example above as `MIRAdt`s: + +:::{code-block} sawscript +m <- mir_load_module "example.linked-mir.json"; + +let s_8_16 = mir_find_adt m "example::S" [mir_u8, mir_u16]; +let s_32_64 = mir_find_adt m "example::S" [mir_u32, mir_u64]; +::: + +The `mir_adt` command (for constructing a struct type), `mir_struct_value` (for +constructing a struct value), and `mir_enum_value` (for constructing an enum +value) commands in turn take a `MIRAdt` as an argument. + +#### Enums + +In addition to taking a `MIRAdt` as an argument, `mir_enum_value` also takes a +`String` representing the name of the variant to construct. The variant name +should be a short name such as `"None"` or `"Some"`, and not a full identifier +such as `"core::option::Option::None"` or `"core::option::Option::Some"`. This +is because the `MIRAdt` already contains the full identifiers for all of an +enum's variants, so SAW will use this information to look up a variant's +identifier from a short name. Here is an example of using `mir_enum_value` in +practice: + +:::{code-block} rust +pub fn n() -> Option { + None +} + +pub fn s(x: u32) -> Option { + Some(x) +} +::: + +:::{code-block} sawscript +m <- mir_load_module "example.linked-mir.json"; + +let option_u32 = mir_find_adt m "core::option::Option" [mir_u32]; + +let n_spec = do { + mir_execute_func []; + + mir_return (mir_enum_value option_u32 "None" []); +}; + +let s_spec = do { + x <- mir_fresh_var "x" mir_u32; + + mir_execute_func [mir_term x]; + + mir_return (mir_enum_value option_u32 "Some" [mir_term x]); +}; +::: + +Note that `mir_enum_value` can only be used to construct a specific variant. If +you need to construct a symbolic enum value that can range over many potential +variants, use `mir_fresh_expanded_value` instead. + +#### Lifetimes + +Rust ADTs can have both type parameters as well as _lifetime_ parameters. The +following Rust code declares a lifetime parameter `'a` on the struct `S`, as +well on the function `f` that computes an `S` value: + +:::{code-block} rust +pub struct S<'a> { + pub x: &'a u32, +} + +pub fn f<'a>(y: &'a u32) -> S<'a> { + S { x: y } +} +::: + +When `mir-json` compiles a piece of Rust code that contains lifetime +parameters, it will instantiate all of the lifetime parameters with a +placeholder MIR type that is simply called `lifetime`. This is important to +keep in mind when looking up ADTs with `mir_find_adt`, as you will also need to +indicate to SAW that the lifetime parameter is instantiated with `lifetime`. In +order to do so, use `mir_lifetime`. For example, here is how to look up `S` +with `'a` instantiated to `lifetime`: + +:::{code-block} sawscript +s_adt = mir_find_adt m "example::S" [mir_lifetime] +::: + +Note that this part of SAW's design is subject to change in the future. +Ideally, users would not have to care about lifetimes at all at the MIR level; +see [this issue](https://github.com/GaloisInc/mir-json/issues/58) for further +discussion on this point. If that issue is fixed, then we will likely remove +`mir_lifetime`, as it will no longer be necessary. + +### Bitfields + +SAW has experimental support for specifying `struct`s with bitfields, such as +in the following example: + +:::{code-block} c +struct s { + uint8_t x:1; + uint8_t y:1; +}; +::: + +Normally, a `struct` with two `uint8_t` fields would have an overall size of +two bytes. However, because the `x` and `y` fields are declared with bitfield +syntax, they are instead packed together into a single byte. + +Because bitfields have somewhat unusual memory representations in LLVM, some +special care is required to write SAW specifications involving bitfields. For +this reason, there is a dedicated `llvm_points_to_bitfield` function for this +purpose: + +- `llvm_points_to_bitfield : SetupValue -> String -> SetupValue -> LLVMSetup ()` + +The type of `llvm_points_to_bitfield` is similar that of `llvm_points_to`, +except that it takes the name of a field within a bitfield as an additional +argument. For example, here is how to assert that the `y` field in the `struct` +example above should be `0`: + +:::{code-block} sawscript +ss <- llvm_alloc (llvm_alias "struct.s"); +llvm_points_to_bitfield ss "y" (llvm_term {{ 0 : [1] }}); +::: + +Note that the type of the right-hand side value (`0`, in this example) must +be a bitvector whose length is equal to the size of the field within the +bitfield. In this example, the `y` field was declared as `y:1`, so `y`'s value +must be of type `[1]`. + +Note that the following specification is _not_ equivalent to the one above: + +:::{code-block} sawscript +ss <- llvm_alloc (llvm_alias "struct.s"); +llvm_points_to (llvm_field ss "y") (llvm_term {{ 0 : [1] }}); +::: + +`llvm_points_to` works quite differently from `llvm_points_to_bitfield` under +the hood, so using `llvm_points_to` on bitfields will almost certainly not work +as expected. + +In order to use `llvm_points_to_bitfield`, one must also use the +`enable_lax_loads_and_stores` command: + +- `enable_lax_loads_and_stores: TopLevel ()` + +Both `llvm_points_to_bitfield` and `enable_lax_loads_and_stores` are +experimental commands, so these also require using `enable_experimental` before +they can be used. + +The `enable_lax_loads_and_stores` command relaxes some +of SAW's assumptions about uninitialized memory, which is necessary to make +`llvm_points_to_bitfield` work under the hood. For example, reading from +uninitialized memory normally results in an error in SAW, but with +`enable_lax_loads_and_stores`, such a read will instead return a symbolic +value. At present, `enable_lax_loads_and_stores` only works with What4-based +tactics (e.g., `w4_unint_z3`); using it with SBV-based tactics +(e.g., `sbv_unint_z3`) will result in an error. + +Note that SAW relies on LLVM debug metadata in order to determine which struct +fields reside within a bitfield. As a result, you must pass `-g` to `clang` when +compiling code involving bitfields in order for SAW to be able to reason about +them. + +## Global variables + +SAW supports verifying LLVM and MIR specifications involving global variables. + +### LLVM global variables + +Mutable global variables that are accessed in a function must first be allocated +by calling `llvm_alloc_global` on the name of the global. + +- `llvm_alloc_global : String -> LLVMSetup ()` + +This ensures that all global variables that might influence the function are +accounted for explicitly in the specification: if `llvm_alloc_global` is +used in the precondition, there must be a corresponding `llvm_points_to` +in the postcondition describing the new state of that global. Otherwise, a +specification might not fully capture the behavior of the function, potentially +leading to unsoundness in the presence of compositional verification. (For more +details on this point, see the [Compositional Verification and Mutable Global +Variables](#compositional-verification-and-mutable-global-variables) section.) + +Immutable (i.e. `const`) global variables are allocated implicitly, and do not +require a call to `llvm_alloc_global`. + +Pointers to global variables or functions can be accessed with +`llvm_global`: + +- `llvm_global : String -> SetupValue` + +Like the pointers returned by `llvm_alloc`, however, these aren't +initialized at the beginning of symbolic -- setting global variables may +be unsound in the presence of [compositional +verification](#compositional-verification). + +To understand the issues surrounding global variables, consider the following C +code: + + +:::{code-block} c +int x = 0; + +int f(int y) { + x = x + 1; + return x + y; +} + +int g(int z) { + x = x + 2; + return x + z; +} +::: + +One might initially write the following specifications for `f` and `g`: + + +:::{code-block} sawscript +m <- llvm_load_module "./test.bc"; + +f_spec <- llvm_verify m "f" [] true (do { + y <- llvm_fresh_var "y" (llvm_int 32); + llvm_execute_func [llvm_term y]; + llvm_return (llvm_term {{ 1 + y : [32] }}); +}) abc; + +g_spec <- llvm_llvm_verify m "g" [] true (do { + z <- llvm_fresh_var "z" (llvm_int 32); + llvm_execute_func [llvm_term z]; + llvm_return (llvm_term {{ 2 + z : [32] }}); +}) abc; +::: + +If globals were always initialized at the beginning of verification, +both of these specs would be provable. However, the results wouldn't +truly be compositional. For instance, it's not the case that `f(g(z)) == +z + 3` for all `z`, because both `f` and `g` modify the global variable +`x` in a way that crosses function boundaries. + +To deal with this, we can use the following function: + +- `llvm_global_initializer : String -> SetupValue` returns the value + of the constant global initializer for the named global variable. + +Given this function, the specifications for `f` and `g` can make this +reliance on the initial value of `x` explicit: + + +:::{code-block} sawscript +m <- llvm_load_module "./test.bc"; + + +let init_global name = do { + llvm_alloc_global name; + llvm_points_to (llvm_global name) + (llvm_global_initializer name); +}; + +f_spec <- llvm_verify m "f" [] true (do { + y <- llvm_fresh_var "y" (llvm_int 32); + init_global "x"; + llvm_precond {{ y < 2^^31 - 1 }}; + llvm_execute_func [llvm_term y]; + llvm_return (llvm_term {{ 1 + y : [32] }}); +}) abc; +::: + +which initializes `x` to whatever it is initialized to in the C code at +the beginning of verification. This specification is now safe for +compositional verification: SAW won't use the specification `f_spec` +unless it can determine that `x` still has its initial value at the +point of a call to `f`. This specification also constrains `y` to prevent +signed integer overflow resulting from the `x + y` expression in `f`, +which is undefined behavior in C. + +(mir-static-items)= +### MIR static items + +Rust's static items are the MIR version of global variables. A reference to a +static item can be accessed with the `mir_static` function. This function takes +a `String` representing a static item's identifier, and this identifier is +expected to adhere to the naming conventions outlined in the "Running a +MIR-based verification" section: + +- `mir_static : String -> MIRValue` + +References to static values can be initialized with the `mir_points_to` +command, just like with other forms of references. Immutable static items +(e.g., `static X: u8 = 42`) are initialized implicitly in every SAW +specification, so there is no need for users to do so manually. Mutable static +items (e.g., `static mut Y: u8 = 27`), on the other hand, are _not_ initialized +implicitly, and users must explicitly pick a value to initialize them with. + +The `mir_static_initializer` function can be used to access the initial value +of a static item in a MIR program. Like with `mir_static`, the `String` +supplied as an argument must be a valid identifier: + +- `mir_static_initializer : String -> MIRValue`. + +As an example of how to use these functions, here is a Rust program involving +static items: + +:::{code-block} rust +// statics.rs +static S1: u8 = 1; +static mut S2: u8 = 2; + +pub fn f() -> u8 { + // Reading a mutable static item requires an `unsafe` block due to + // concurrency-related concerns. We are only concerned about the behavior + // of this program in a single-threaded context, so this is fine. + let s2 = unsafe { S2 }; + S1 + s2 +} +::: + +We can write a specification for `f` like so: + +:::{code-block} sawscript +// statics.saw +enable_experimental; + +let f_spec = do { + mir_points_to (mir_static "statics::S2") + (mir_static_initializer "statics::S2"); + // Note that we do not initialize S1, as immutable static items are implicitly + // initialized in every specification. + + mir_execute_func []; + + mir_return (mir_term {{ 3 : [8] }}); +}; + +m <- mir_load_module "statics.linked-mir.json"; + +mir_verify m "statics::f" [] false f_spec z3; +::: + +In order to use a specification involving mutable static items for +compositional verification, it is required to specify the value of all mutable +static items using the `mir_points_to` command in the specification's +postconditions. For more details on this point, see the [Compositional +Verification and Mutable Global +Variables](#compositional-verification-and-mutable-global-variables) section. + +## Preconditions and Postconditions + +Sometimes a function is only well-defined under certain conditions, or +sometimes you may be interested in certain initial conditions that give +rise to specific final conditions. For these cases, you can specify an +arbitrary predicate as a precondition or post-condition, using any +values in scope at the time. + +- `llvm_precond : Term -> LLVMSetup ()` +- `llvm_postcond : Term -> LLVMSetup ()` +- `llvm_assert : Term -> LLVMSetup ()` +- `jvm_precond : Term -> JVMSetup ()` +- `jvm_postcond : Term -> JVMSetup ()` +- `jvm_assert : Term -> JVMSetup ()` +- `mir_precond : Term -> MIRSetup ()` +- `mir_postcond : Term -> MIRSetup ()` +- `mir_assert : Term -> MIRSetup ()` + +These commands take `Term` arguments, and therefore cannot describe the values +of pointers. The "assert" variants will work in either pre- or post-conditions, +and are useful when defining helper functions that, e.g., provide datastructure +invariants that make sense in both phases. The `{llvm,jvm,mir}_equal` commands +state that two values should be equal, and can be used in either the initial or +the final state. + +- `llvm_equal : SetupValue -> SetupValue -> LLVMSetup ()` +- `jvm_equal : JVMValue -> JVMValue -> JVMSetup ()` +- `mir_equal : MIRValue -> MIRValue -> MIRSetup ()` + +The use of `{llvm,jvm,mir}_equal` can also sometimes lead to more efficient +symbolic execution when the predicate of interest is an equality. + +## Assuming specifications + +Normally, a `MethodSpec` is the result of both simulation and proof of +the target code. However, in some cases, it can be useful to use a +`MethodSpec` to specify some code that either doesn't exist or is hard +to prove. The previously-mentioned [`assume_unsat` +tactic](#finishing-proofs-without-external-solvers) omits proof but does not prevent +simulation of the function. To skip simulation altogether, one can use +one of the following commands: + + +- `llvm_unsafe_assume_spec : LLVMModule -> String -> LLVMSetup () -> TopLevel CrucibleMethodSpec` +- `jvm_unsafe_assume_spec : JavaClass -> String -> JVMSetup () -> TopLevel JVMMethodSpec` +- `mir_unsafe_assume_spec : MIRModule -> String -> MIRSetup () -> TopLevel MIRSpec` + +## A Heap-Based Example + +To tie all of the command descriptions from the previous sections +together, consider the case of verifying the correctness of a C program +that computes the dot product of two vectors, where the length and value +of each vector are encapsulated together in a `struct`. + +The dot product can be concisely specified in Cryptol as follows: + +:::{code-block} cryptol +dotprod : {n, a} (fin n, fin a) => [n][a] -> [n][a] -> [a] +dotprod xs ys = sum (zip (*) xs ys) +::: + +To implement this in C, let's first consider the type of vectors: + +:::{code-block} c +typedef struct { + uint32_t *elts; + uint32_t size; +} vec_t; +::: + +This struct contains a pointer to an array of 32-bit elements, and a +32-bit value indicating how many elements that array has. + +We can compute the dot product of two of these vectors with the +following C code (which uses the size of the shorter vector if they +differ in size). + +:::{code-block} c +uint32_t dotprod_struct(vec_t *x, vec_t *y) { + uint32_t size = MIN(x->size, y->size); + uint32_t res = 0; + for(size_t i = 0; i < size; i++) { + res += x->elts[i] * y->elts[i]; + } + return res; +} +::: + +The entirety of this implementation can be found in the +`examples/llvm/dotprod_struct.c` file in the `saw-script` repository. + +To verify this program in SAW, it will be convenient to define a couple +of utility functions (which are generally useful for many +heap-manipulating programs). First, combining allocation and +initialization to a specific value can make many scripts more concise: + +:::{code-block} sawscript +let alloc_init ty v = do { + p <- llvm_alloc ty; + llvm_points_to p v; + return p; +}; +::: + +This creates a pointer `p` pointing to enough space to store type `ty`, +and then indicates that the pointer points to value `v` (which should be +of that same type). + +A common case for allocation and initialization together is when the +initial value should be entirely symbolic. + +:::{code-block} sawscript +let ptr_to_fresh n ty = do { + x <- llvm_fresh_var n ty; + p <- alloc_init ty (llvm_term x); + return (x, p); +}; +::: + +This function returns the pointer just allocated along with the fresh +symbolic value it points to. + +Given these two utility functions, the `dotprod_struct` function can be +specified as follows: + +:::{code-block} sawscript +let dotprod_spec n = do { + let nt = llvm_term {{ `n : [32] }}; + (xs, xsp) <- ptr_to_fresh "xs" (llvm_array n (llvm_int 32)); + (ys, ysp) <- ptr_to_fresh "ys" (llvm_array n (llvm_int 32)); + let xval = llvm_struct_value [ xsp, nt ]; + let yval = llvm_struct_value [ ysp, nt ]; + xp <- alloc_init (llvm_alias "struct.vec_t") xval; + yp <- alloc_init (llvm_alias "struct.vec_t") yval; + llvm_execute_func [xp, yp]; + llvm_return (llvm_term {{ dotprod xs ys }}); +}; +::: + +Any instantiation of this specification is for a specific vector length +`n`, and assumes that both input vectors have that length. That length +`n` automatically becomes a type variable in the subsequent Cryptol +expressions, and the backtick operator is used to reify that type as a +bit vector of length 32. + +The entire script can be found in the `dotprod_struct-crucible.saw` file +alongside `dotprod_struct.c`. + +Running this script results in the following: + +:::{code-block} console +Loading file "dotprod_struct.saw" +Proof succeeded! dotprod_struct +Registering override for `dotprod_struct` + variant `dotprod_struct` +Symbolic simulation completed with side conditions. +Proof succeeded! dotprod_wrap +::: + +## Using Ghost State + +In some cases, information relevant to verification is not directly +present in the concrete state of the program being verified. This can +happen for at least two reasons: + +- When providing specifications for external functions, for which source + code is not present. The external code may read and write global state + that is not directly accessible from the code being verified. + +- When the abstract specification of the program naturally uses a + different representation for some data than the concrete + implementation in the code being verified does. + +One solution to these problems is the use of _ghost_ state. This can be +thought of as additional global state that is visible only to the +verifier. Ghost state with a given name can be declared at the top level +with the following function: + +- `declare_ghost_state : String -> TopLevel Ghost` + +Ghost state variables do not initially have any particluar type, and can +store data of any type. Given an existing ghost variable the following +functions can be used to specify its value: + +- `llvm_ghost_value : Ghost -> Term -> LLVMSetup ()` +- `jvm_ghost_value : Ghost -> Term -> JVMSetup ()` +- `mir_ghost_value : Ghost -> Term -> MIRSetup ()` + +These can be used in either the pre state or the post state, to specify the +value of ghost state either before or after the execution of the function, +respectively. + +## An Extended Example + +To tie together many of the concepts in this manual, we now present a +non-trivial verification task in its entirety. All of the code for this example +can be found in the `examples/salsa20` directory of +[the SAWScript repository](https://github.com/GaloisInc/saw-script). + +### Salsa20 Overview + +Salsa20 is a stream cipher developed in 2005 by Daniel J. Bernstein, built on a +pseudorandom function utilizing add-rotate-XOR (ARX) operations on 32-bit +words[^4]. Bernstein himself has provided several public domain +implementations of the cipher, optimized for common machine architectures. +For the mathematically inclined, his specification for the cipher can be +found [here](http://cr.yp.to/snuffle/spec.pdf). + +The repository referenced above contains three implementations of the Salsa20 +cipher: A reference Cryptol implementation (which we take as correct in this +example), and two C implementations, one of which is from Bernstein himself. +For this example, we focus on the second of these C +implementations, which more closely matches the Cryptol implementation. Full +verification of Bernstein's implementation is available in +`examples/salsa20/djb`, for the interested. The code for this verification task +can be found in the files named according to the pattern +`examples/salsa20/(s|S)alsa20.*`. + +### Specifications + +We now take on the actual verification task. This will be done in two stages: +We first define some useful utility functions for constructing common patterns +in the specifications for this type of program (i.e. one where the arguments to +functions are modified in-place.) We then demonstrate how one might construct a +specification for each of the functions in the Salsa20 implementation described +above. + +#### Utility Functions + +We first define the function +`alloc_init : LLVMType -> Term -> LLVMSetup SetupValue`. + +`alloc_init ty v` returns a `SetupValue` consisting of a pointer to memory +allocated and initialized to a value `v` of type `ty`. `alloc_init_readonly` +does the same, except the memory allocated cannot be written to. + +:::{code-block} sawscript +import "Salsa20.cry"; + +let alloc_init ty v = do { + p <- llvm_alloc ty; + llvm_points_to p (llvm_term v); + return p; +}; + +let alloc_init_readonly ty v = do { + p <- llvm_alloc_readonly ty; + llvm_points_to p (llvm_term v); + return p; +}; +::: + +We now define +`ptr_to_fresh : String -> LLVMType -> LLVMSetup (Term, SetupValue)`. + +`ptr_to_fresh n ty` returns a pair `(x, p)` consisting of a fresh symbolic +variable `x` of type `ty` and a pointer `p` to it. `n` specifies the +name that SAW should use when printing `x`. `ptr_to_fresh_readonly` does the +same, but returns a pointer to space that cannot be written to. + +:::{code-block} sawscript +let ptr_to_fresh n ty = do { + x <- llvm_fresh_var n ty; + p <- alloc_init ty x; + return (x, p); +}; + +let ptr_to_fresh_readonly n ty = do { + x <- llvm_fresh_var n ty; + p <- alloc_init_readonly ty x; + return (x, p); +}; +::: + +Finally, we define +`oneptr_update_func : String -> LLVMType -> Term -> LLVMSetup ()`. + +`oneptr_update_func n ty f` specifies the behavior of a function that takes +a single pointer (with a printable name given by `n`) to memory containing a +value of type `ty` and mutates the contents of that memory. The specification +asserts that the contents of this memory after execution are equal to the value +given by the application of `f` to the value in that memory before execution. + +:::{code-block} sawscript +let oneptr_update_func n ty f = do { + (x, p) <- ptr_to_fresh n ty; + llvm_execute_func [p]; + llvm_points_to p (llvm_term {{ f x }}); +}; +::: + +#### The `quarterround` operation + +The C function we wish to verify has type +`void s20_quarterround(uint32_t *y0, uint32_t *y1, uint32_t *y2, uint32_t *y3)`. + +The function's specification generates four symbolic variables and pointers to +them in the precondition/setup stage. The pointers are passed to the function +during symbolic execution via `llvm_execute_func`. Finally, in the +postcondition/return stage, the expected values are computed using the trusted +Cryptol implementation and it is asserted that the pointers do in fact point to +these expected values. + +:::{code-block} sawscript +let quarterround_setup : LLVMSetup () = do { + (y0, p0) <- ptr_to_fresh "y0" (llvm_int 32); + (y1, p1) <- ptr_to_fresh "y1" (llvm_int 32); + (y2, p2) <- ptr_to_fresh "y2" (llvm_int 32); + (y3, p3) <- ptr_to_fresh "y3" (llvm_int 32); + + llvm_execute_func [p0, p1, p2, p3]; + + let zs = {{ quarterround [y0,y1,y2,y3] }}; // from Salsa20.cry + llvm_points_to p0 (llvm_term {{ zs@0 }}); + llvm_points_to p1 (llvm_term {{ zs@1 }}); + llvm_points_to p2 (llvm_term {{ zs@2 }}); + llvm_points_to p3 (llvm_term {{ zs@3 }}); +}; +::: + +#### Simple Updating Functions + +The following functions can all have their specifications given by the utility +function `oneptr_update_func` implemented above, so there isn't much to say +about them. + +:::{code-block} sawscript +let rowround_setup = + oneptr_update_func "y" (llvm_array 16 (llvm_int 32)) {{ rowround }}; + +let columnround_setup = + oneptr_update_func "x" (llvm_array 16 (llvm_int 32)) {{ columnround }}; + +let doubleround_setup = + oneptr_update_func "x" (llvm_array 16 (llvm_int 32)) {{ doubleround }}; + +let salsa20_setup = + oneptr_update_func "seq" (llvm_array 64 (llvm_int 8)) {{ Salsa20 }}; +::: + +#### 32-Bit Key Expansion + +The next function of substantial behavior that we wish to verify has the +following prototype: + +:::{code-block} c +void s20_expand32( uint8_t *k + , uint8_t n[static 16] + , uint8_t keystream[static 64] + ) +::: + +This function's specification follows a similar pattern to that of +`s20_quarterround`, though for extra assurance we can make sure that the +function does not write to the memory pointed to by `k` or `n` using the +utility `ptr_to_fresh_readonly`, as this function should only modify +`keystream`. Besides this, we see the call to the trusted Cryptol +implementation specialized to `a=2`, which does 32-bit key expansion (since the +Cryptol implementation can also specialize to `a=1` for 16-bit keys). This +specification can easily be changed to work with 16-bit keys. + +:::{code-block} sawscript +let salsa20_expansion_32 = do { + (k, pk) <- ptr_to_fresh_readonly "k" (llvm_array 32 (llvm_int 8)); + (n, pn) <- ptr_to_fresh_readonly "n" (llvm_array 16 (llvm_int 8)); + + pks <- llvm_alloc (llvm_array 64 (llvm_int 8)); + + llvm_execute_func [pk, pn, pks]; + + let rks = {{ Salsa20_expansion`{a=2}(k, n) }}; + llvm_points_to pks (llvm_term rks); +}; +::: + +#### 32-bit Key Encryption + +Finally, we write a specification for the encryption function itself, which has +type + +:::{code-block} c +enum s20_status_t s20_crypt32( uint8_t *key + , uint8_t nonce[static 8] + , uint32_t si + , uint8_t *buf + , uint32_t buflen + ) +::: + +As before, we can ensure this function does not modify the memory pointed to by +`key` or `nonce`. We take `si`, the stream index, to be 0. The specification is +parameterized on a number `n`, which corresponds to `buflen`. Finally, to deal +with the fact that this function returns a status code, we simply specify that +we expect a success (status code 0) as the return value in the postcondition +stage of the specification. + +:::{code-block} sawscript +let s20_encrypt32 n = do { + (key, pkey) <- ptr_to_fresh_readonly "key" (llvm_array 32 (llvm_int 8)); + (v, pv) <- ptr_to_fresh_readonly "nonce" (llvm_array 8 (llvm_int 8)); + (m, pm) <- ptr_to_fresh "buf" (llvm_array n (llvm_int 8)); + + llvm_execute_func [ pkey + , pv + , llvm_term {{ 0 : [32] }} + , pm + , llvm_term {{ `n : [32] }} + ]; + + llvm_points_to pm (llvm_term {{ Salsa20_encrypt (key, v, m) }}); + llvm_return (llvm_term {{ 0 : [32] }}); +}; +::: + +### Verifying Everything + +Finally, we can verify all of the functions. Notice the use of compositional +verification and that path satisfiability checking is enabled for those +functions with loops not bounded by explicit constants. Notice that we prove +the top-level function for several sizes; this is due to the limitation that +SAW can only operate on finite programs (while Salsa20 can operate on any input +size.) + +:::{code-block} sawscript +let main : TopLevel () = do { + m <- llvm_load_module "salsa20.bc"; + qr <- llvm_verify m "s20_quarterround" [] false quarterround_setup abc; + rr <- llvm_verify m "s20_rowround" [qr] false rowround_setup abc; + cr <- llvm_verify m "s20_columnround" [qr] false columnround_setup abc; + dr <- llvm_verify m "s20_doubleround" [cr,rr] false doubleround_setup abc; + s20 <- llvm_verify m "s20_hash" [dr] false salsa20_setup abc; + s20e32 <- llvm_verify m "s20_expand32" [s20] true salsa20_expansion_32 abc; + s20encrypt_63 <- llvm_verify m "s20_crypt32" [s20e32] true (s20_encrypt32 63) abc; + s20encrypt_64 <- llvm_verify m "s20_crypt32" [s20e32] true (s20_encrypt32 64) abc; + s20encrypt_65 <- llvm_verify m "s20_crypt32" [s20e32] true (s20_encrypt32 65) abc; + + print "Done!"; +}; +::: + +[^4]: + +## Verifying Cryptol FFI functions + +SAW has special support for verifying the correctness of Cryptol's +[`foreign` +functions](https://galoisinc.github.io/cryptol/master/FFI.html), +implemented in a language such as C which compiles to LLVM, provided +that there exists a [reference Cryptol +implementation](https://galoisinc.github.io/cryptol/master/FFI.html#cryptol-implementation-of-foreign-functions) +of the function as well. Since the way in which `foreign` functions are +called is precisely specified by the Cryptol FFI, SAW is able to +generate a `LLVMSetup ()` spec directly from the type of a Cryptol +`foreign` function. This is done with the `llvm_ffi_setup` command, +which is experimental and requires `enable_experimental;` to be run +beforehand. + +- `llvm_ffi_setup : Term -> LLVMSetup ()` + +For instance, for the simple imported Cryptol foreign function `foreign +add : [32] -> [32] -> [32]` we can obtain a `LLVMSetup` spec simply by +writing + +:::{code-block} sawscript +let add_setup = llvm_ffi_setup {{ add }}; +::: + +which behind the scenes expands to something like + +:::{code-block} sawscript +let add_setup = do { + in0 <- llvm_fresh_var "in0" (llvm_int 32); + in1 <- llvm_fresh_var "in1" (llvm_int 32); + llvm_execute_func [llvm_term in0, llvm_term in1]; + llvm_return (llvm_term {{ add in0 in1 }}); +}; +::: + +### Polymorphism + +In general, Cryptol `foreign` functions can be polymorphic, with type +parameters of kind `#`, representing e.g. the sizes of input sequences. +However, any individual `LLVMSetup ()` spec only specifies the behavior +of the LLVM function on inputs of concrete sizes. We handle this by +allowing the argument term of `llvm_ffi_setup` to contain any necessary +type arguments in addition to the Cryptol function name, so that the +resulting term is monomorphic. The user can then define a parameterized +specification simply as a SAWScript function in the usual way. For +example, for a function `foreign f : {n, m} (fin n, fin m) => [n][32] -> +[m][32]`, we can obtain a parameterized `LLVMSetup` spec by + +:::{code-block} sawscript +let f_setup (n : Int) (m : Int) = llvm_ffi_setup {{ f`{n, m} }}; +::: + +Note that the `Term` parameter that `llvm_ffi_setup` takes is restricted +syntactically to the format described above (``{{ fun`{tyArg0, tyArg1, +..., tyArgN} }}``), and cannot be any arbitrary term. + +### Supported types + +`llvm_ffi_setup` supports all Cryptol types that are supported by the +Cryptol FFI, with the exception of `Integer`, `Rational`, `Z`, and +`Float`. `Integer`, `Rational`, and `Z` are not supported since they are +translated to `gmp` arbitrary-precision types which are hard for SAW to +handle without additional overrides. There is no fundamental obstacle to +supporting `Float`, and in fact `llvm_ffi_setup` itself does work with +Cryptol floating point types, but the underlying functions such as +`llvm_fresh_var` do not, so until that is implemented `llvm_ffi_setup` +can generate a spec involving floating point types but it cannot +actually be run. + +### Performing the verification + +The resulting `LLVMSetup ()` spec can be used with the existing +`llvm_verify` function to perform the actual verification. And the +`LLVMSpec` output from that can be used as an override as usual for +further compositional verification. + +:::{code-block} sawscript +f_ov <- llvm_verify mod "f" [] true (f_setup 3 5) z3; +::: + +As with the Cryptol FFI itself, SAW does not manage the compilation of +the C source implementations of `foreign` functions to LLVM bitcode. For +the verification to be meaningful, is expected that the LLVM module +passed to `llvm_verify` matches the compiled dynamic library actually +used with the Cryptol interpreter. Alternatively, on x86_64 Linux, SAW +can perform verification directly on the `.so` ELF file with the +experimental `llvm_verify_x86` command. diff --git a/doc/saw-user-manual/structure-of-sawscript.md b/doc/saw-user-manual/structure-of-sawscript.md new file mode 100644 index 0000000000..7dc72dd9fb --- /dev/null +++ b/doc/saw-user-manual/structure-of-sawscript.md @@ -0,0 +1,431 @@ +# Structure of SAWScript + +A SAWScript program consists, at the top level, of a sequence of +commands to be executed in order. Each command is terminated with a +semicolon. For example, the `print` command displays a textual +representation of its argument. Suppose the following text is stored in +the file `print.saw`: + +:::{code-block} sawscript +print 3; +::: + +The command `saw print.saw` will then yield output similar to the +following: + +:::{code-block} console +Loading module Cryptol +Loading file "print.saw" +3 +::: + +The same code can be run from the interactive REPL: + +:::{code-block} console +sawscript> print 3; +3 +::: + +At the REPL, terminating semicolons can be omitted: + +:::{code-block} console +sawscript> print 3 +3 +::: + +To make common use cases simpler, bare values at the REPL are treated as +if they were arguments to `print`: + +:::{code-block} console +sawscript> 3 +3 +::: + +One SAWScript file can be included in another using the `include` +command, which takes the name of the file to be included as an argument. +For example: + +:::{code-block} console +sawscript> include "print.saw" +Loading file "print.saw" +3 +::: + +Typically, included files are used to import definitions, not perform +side effects like printing. However, as you can see, if any commands +with side effects occur at the top level of the imported file, those +side effects will occur during import. + +## Syntax + +The syntax of SAWScript is reminiscent of functional languages such as +Cryptol, Haskell and ML. In particular, functions are applied by +writing them next to their arguments rather than by using parentheses +and commas. Rather than writing `f(x, y)`, write `f x y`. + +Comments are written as in C, Java, and Rust (among many other languages). All +text from `//` until the end of a line is ignored. Additionally, all +text between `/*` and `*/` is ignored, regardless of whether the line +ends. + +## Basic Types and Values + +All values in SAWScript have types, and these types are determined and +checked before a program runs (that is, SAWScript is statically typed). +The basic types available are similar to those in many other languages. + +- The `Int` type represents unbounded mathematical integers. Integer + constants can be written in decimal notation (e.g., `42`), hexadecimal + notation (`0x2a`), and binary (`0b00101010`). However, unlike many + languages, integers in SAWScript are used primarily as constants. + Arithmetic is usually encoded in Cryptol, as discussed in the next + section. + +- The Boolean type, `Bool`, contains the values `true` and `false`, like + in many other languages. As with integers, computations on Boolean + values usually occur in Cryptol. + +- Values of any type can be aggregated into tuples. For example, the + value `(true, 10)` has the type `(Bool, Int)`. + +- Values of any type can also be aggregated into records, which are + exactly like tuples except that their components have names. For + example, the value `{ b = true, n = 10 }` has the type `{ b : Bool, + n : Int }`. + +- A sequence of values of the same type can be stored in a list. For + example, the value `[true, false, true]` has the type `[Bool]`. + +- Strings of textual characters can be represented in the `String` type. + For example, the value `"example"` has type `String`. + +- The "unit" type, written `()`, is essentially a placeholder, similar + to `void` in languages like C and Java. It has only one value, also + written `()`. Values of type `()` convey no information. We will show + in later sections several cases where this is useful. + +- Functions are given types that indicate what type they consume and + what type they produce. For example, the type `Int -> Bool` indicates + a function that takes an `Int` as input and produces a `Bool` as + output. Functions with multiple arguments use multiple arrows. For + example, the type `Int -> String -> Bool` indicates a function in + which the first argument is an `Int`, the second is a `String`, and + the result is a `Bool`. It is possible, but not necessary, to group + arguments in tuples, as well, so the type `(Int, String) -> Bool` + describes a function that takes one argument, a pair of an `Int` and a + `String`, and returns a `Bool`. + +SAWScript also includes some more specialized types that do not have +straightforward counterparts in most other languages. These will appear +in later sections. + +## Basic Expression Forms + +One of the key forms of top-level command in SAWScript is a _binding_, +introduced with the `let` keyword, which gives a name to a value. For +example: + +:::{code-block} console +sawscript> let x = 5 +sawscript> x +5 +::: + +Bindings can have parameters, in which case they define functions. For +instance, the following function takes one parameter and constructs a +list containing that parameter as its single element. + +:::{code-block} console +sawscript> let f x = [x] +sawscript> f "text" +["text"] +::: + +Functions themselves are values and have types. The type of a function +that takes an argument of type `a` and returns a result of type `b` is +`a -> b`. + +Function types are typically inferred, as in the example `f` +above. In this case, because `f` only creates a list with the given +argument, and because it is possible to create a list of any element +type, `f` can be applied to an argument of any type. We say, therefore, +that `f` is _polymorphic_. Concretely, we write the type of `f` as `{a} +a -> [a]`, meaning it takes a value of any type (denoted `a`) and +returns a list containing elements of that same type. This means we can +also apply `f` to `10`: + +:::{code-block} console +sawscript> f 10 +[10] +::: + +However, we may want to specify that a function has a more +specific type. In this case, we could restrict `f` to operate only on +`Int` parameters. + +:::{code-block} console +sawscript> let f (x : Int) = [x] +::: + +This will work identically to the original `f` on an `Int` parameter: + +:::{code-block} console +sawscript> f 10 +[10] +::: + +However, it will fail for a `String` parameter: + +:::{code-block} console +sawscript> f "text" + +type mismatch: String -> t.0 and Int -> [Int] + at "_" (REPL) +mismatched type constructors: String and Int +::: + +Type annotations can be applied to any expression. The notation `(e : +t)` indicates that expression `e` is expected to have type `t` and +that it is an error for `e` to have a different type. Most types in +SAWScript are inferred automatically, but specifying them explicitly can +sometimes enhance readability. + +Because functions are values, functions can return other functions. We +make use of this feature when writing functions of multiple arguments. +Consider the function `g`, similar to `f` but with two arguments: + +:::{code-block} console +sawscript> let g x y = [x, y] +::: + +Like `f`, `g` is polymorphic. Its type is `{a} a -> a -> [a]`. This +means it takes an argument of type `a` and returns a _function_ that +takes an argument of the same type `a` and returns a list of `a` values. +We can therefore apply `g` to any two arguments of the same type: + +:::{code-block} console +sawscript> g 2 3 +[2,3] +sawscript> g true false +[true,false] +::: + +But type checking will fail if we apply it to two values of different types: + +:::{code-block} console +sawscript> g 2 false + +type mismatch: Bool -> t.0 and Int -> [Int] + at "_" (REPL) +mismatched type constructors: Bool and Int +::: + +So far we have used two related terms, _function_ and _command_, and we +take these to mean slightly different things. A function is any value +with a function type (e.g., `Int -> [Int]`). A command is any value with +a special command type (e.g. `TopLevel ()`, as shown below). These +special types allow us to restrict command usage to specific contexts, +and are also _parameterized_ (like the list type). Most but not all +commands are also functions. + +The most important command type is the `TopLevel` type, indicating a +command that can run at the top level (directly at the REPL, or as one +of the top level commands in a script file). The `print` command +has the type `{a} a -> TopLevel ()`, where `TopLevel ()` means that it +is a command that runs in the `TopLevel` context and returns a value of +type `()` (that is, no useful information). In other words, it has a +side effect (printing some text to the screen) but doesn't produce any +information to use in the rest of the SAWScript program. This is the +primary usage of the `()` type. + +It can sometimes be useful to bind a sequence of commands together. This +can be accomplished with the `do { ... }` construct. For example: + +:::{code-block} console +sawscript> let print_two = do { print "first"; print "second"; } +sawscript> print_two +first +second +::: + +The bound value, `print_two`, has type `TopLevel ()`, since that is the +type of its last command. + +Note that in the previous example the printing doesn't occur until +`print_two` directly appears at the REPL. The `let` expression does not +cause those commands to run. The construct that _runs_ a command is +written using the `<-` operator. This operator works like `let` except +that it says to run the command listed on the right hand side and bind +the result, rather than binding the variable to the command itself. +Using `<-` instead of `let` in the previous example yields: + +:::{code-block} console +sawscript> print_two <- do { print "first"; print "second"; } +first +second +sawscript> print print_two +() +::: + +Here, the `print` commands run first, and then `print_two` gets the +value returned by the second `print` command, namely `()`. Any command +run without using `<-` at either the top level of a script or within a +`do` block discards its result. However, the REPL prints the result of +any command run without using the `<-` operator. + +In some cases it can be useful to have more control over the value +returned by a `do` block. The `return` command allows us to do this. For +example, say we wanted to write a function that would print a message +before and after running some arbitrary command and then return the +result of that command. We could write: + +:::{code-block} sawscript +let run_with_message msg c = + do { + print "Starting."; + print msg; + res <- c; + print "Done."; + return res; + }; + +x <- run_with_message "Hello" (return 3); +print x; +::: + +If we put this script in `run.saw` and run it with `saw`, we get +something like: + +:::{code-block} console +Loading module Cryptol +Loading file "run.saw" +Starting. +Hello +Done. +3 +::: + +Note that it ran the first `print` command, then the caller-specified +command, then the second `print` command. The result stored in `x` at +the end is the result of the `return` command passed in as an argument. + +## Other Basic Functions + +Aside from the functions we have listed so far, there are a number of other +operations for working with basic data structures and interacting +with the operating system. + +The following functions work on lists: + +- `concat : {a} [a] -> [a] -> [a]` takes two lists and returns the +concatenation of the two. + +- `head : {a} [a] -> a` returns the first element of a list. + +- `tail : {a} [a] -> [a]` returns everything except the first element. + +- `length : {a} [a] -> Int` counts the number of elements in a list. + +- `null : {a} [a] -> Bool` indicates whether a list is empty (has zero +elements). + +- `nth : {a} [a] -> Int -> a` returns the element at the given position, +with `nth l 0` being equivalent to `head l`. + +- `for : {m, a, b} [a] -> (a -> m b) -> m [b]` takes a list and a +function that runs in some command context. The passed command will be +called once for every element of the list, in order. Returns a list of +all of the results produced by the command. + +For interacting with the operating system, we have: + +- `get_opt : Int -> String` returns the command-line argument to `saw` +at the given index. Argument 0 is always the name of the `saw` +executable itself, and higher indices represent later arguments. + +- `exec : String -> [String] -> String -> TopLevel String` runs an +external program given, respectively, an executable name, a list of +arguments, and a string to send to the standard input of the program. +The `exec` command returns the standard output from the program it +executes and prints standard error to the screen. + +- `exit : Int -> TopLevel ()` stops execution of the current script and +returns the given exit code to the operating system. + +Finally, there are a few miscellaneous functions and commands: + +- `show : {a} a -> String` computes the textual representation of its +argument in the same way as `print`, but instead of displaying the value +it returns it as a `String` value for later use in the program. This can +be useful for constructing more detailed messages later. + +- `str_concat : String -> String -> String` concatenates two `String` +values, and can also be useful with `show`. + +- `time : {a} TopLevel a -> TopLevel a` runs any other `TopLevel` +command and prints out the time it took to execute. + +- `with_time : {a} TopLevel a -> TopLevel (Int, a)` returns both the +original result of the timed command and the time taken to execute it +(in milliseconds), without printing anything in the process. + +## REPL Actions + +There is an additional class of things that one may type at the REPL +for interactive use: + +- `:cd` changes the REPL's current directory. + +- `:pwd` prints the REPL's current directory. + +- `:env` displays the values and types of all currently bound +variables, including built-in functions and commands. + +- `:tenv` displays the expansions of all currently defined type +aliases, including those that are built in. + +- `:type` or `:t` checks and prints the type of an arbitrary SAWScript +expression: + +:::{code-block} console +sawscript> :t show +{a.0} a.0 -> String +::: + +- `:help` or `:h` prints the help text for a built-in function or +command: + +:::{code-block} console +sawscript> :h show + +Description +----------- + + show : {a} a -> String + +Convert the value of the given expression to a string. +::: + +- `:quit` or `:q` exits the program. + +## Further built-in functions and commands + +SAW contains many built-in operations, referred to as "primitives." +These appear in SAWScript as built-in functions and commands. +The following sections of the manual will introduce many of these. + +## Experimental and deprecated functions and commands + +Some of the primitives available in SAW at any given time are +experimental. +These may be incomplete, unfinished, use at your own risk, etc. +The functions and commands associated with these are unavailable +by default; they can be made visible with the `enable_experimental` +command. + +Other primitives are considered deprecated. +Some of these, as the +[deprecation process](formal-deprecation-process.md) proceeds, are +unavailable by default. + +They can be made visible with the `enable_deprecated` command. diff --git a/doc/saw-user-manual/symbolic-execution.md b/doc/saw-user-manual/symbolic-execution.md new file mode 100644 index 0000000000..8f5305e325 --- /dev/null +++ b/doc/saw-user-manual/symbolic-execution.md @@ -0,0 +1,92 @@ +# Symbolic Execution + +Analysis of Java and LLVM within SAWScript relies heavily on *symbolic +execution*, so some background on how this process works can help with +understanding the behavior of the available built-in functions. + +At the most abstract level, symbolic execution works like normal program +execution except that the values of all variables within the program can be +arbitrary *expressions*, potentially containing free variables, rather than +concrete values. Therefore, each symbolic execution corresponds to some set +of possible concrete executions. + +As a concrete example, consider the following C program that returns +the maximum of two values: + +:::{code-block} c +unsigned int max(unsigned int x, unsigned int y) { + if (y > x) { + return y; + } else { + return x; + } +} +::: + +If you call this function with two concrete inputs, like this: + +:::{code-block} c +int r = max(5, 4); +::: + +then it will assign the value `5` to `r`. However, we can also consider what +it will do for *arbitrary* inputs. Consider the following example: + +:::{code-block} c +int r = max(a, b); +::: + +where `a` and `b` are variables with unknown values. It is still +possible to describe the result of the `max` function in terms of `a` +and `b`. The following expression describes the value of `r`: + +:::{code-block} text +ite (b > a) b a +::: + +where `ite` is the "if-then-else" mathematical function, which based on +the value of the first argument returns either the second or third. One +subtlety of constructing this expression, however, is the treatment of +conditionals in the original program. For any concrete values of `a` and +`b`, only one branch of the `if` statement will execute. During symbolic +execution, on the other hand, it is necessary to execute *both* +branches, track two different program states (each composed of symbolic +values), and then *merge* those states after executing the `if` +statement. This merging process takes into account the original branch +condition and introduces the `ite` expression. + +A symbolic execution system, then, is very similar to an interpreter +that has a different notion of what constitutes a value and executes +*all* paths through the program instead of just one. Therefore, the +execution process is similar to that of a normal +interpreter, and the process of generating a model for a piece of code +is similar to building a test harness for that same code. + +More specifically, the setup process for a test harness typically takes +the following form: + +1. Initialize or allocate any resources needed by the code. For Java and + LLVM code, this typically means allocating memory and setting the + initial values of variables. + +2. Execute the code. + +3. Check the desired properties of the system state after the code + completes. + +Accordingly, three pieces of information are particularly relevant to +the symbolic execution process, and are therefore needed as input to the +symbolic execution system: + +- The initial (potentially symbolic) state of the system. + +- The code to execute. + +- The final state of the system, and which parts of it are relevant to + the properties being tested. + +In the following sections, we describe how the Java and LLVM analysis +primitives work in the context of these key concepts. We start with the +simplest situation, in which the structure of the initial and final +states can be directly inferred, and move on to more complex cases that +require more information from the user. diff --git a/doc/saw-user-manual/symbolic-termination.md b/doc/saw-user-manual/symbolic-termination.md new file mode 100644 index 0000000000..fdc9175ae3 --- /dev/null +++ b/doc/saw-user-manual/symbolic-termination.md @@ -0,0 +1,66 @@ +# Symbolic Termination + +Above we described the process of executing multiple branches and +merging the results when encountering a conditional statement in the +program. When a program contains loops, the branch that chooses to +continue or terminate a loop could go either way. Therefore, without a +bit more information, the most obvious implementation of symbolic +execution would never terminate when executing programs that contain +loops. + +The solution to this problem is to analyze the branch condition whenever +considering multiple branches. If the condition for one branch can never +be true in the context of the current symbolic state, there is no reason +to execute that branch, and skipping it can make it possible for +symbolic execution to terminate. + +Directly comparing the branch condition to a constant can sometimes be +enough to ensure termination. For example, in simple, bounded loops like +the following, comparison with a constant is sufficient. + +:::{code-block} c +for (int i = 0; i < 10; i++) { + // do something +} +::: + +In this case, the value of `i` is always concrete, and will eventually +reach the value `10`, at which point the branch corresponding to +continuing the loop will be infeasible. + +As a more complex example, consider the following function: + +:::{code-block} c +uint8_t f(uint8_t i) { + int done = 0; + while (!done) { + if (i % 8 == 0) done = 1; + i += 5; + } + return i; +} +::: + +The loop in this function can only be determined to symbolically +terminate if the analysis takes into account algebraic rules about +common multiples. Similarly, it can be difficult to prove that a base +case is eventually reached for all inputs to a recursive program. + +In this particular case, however, the code _is_ guaranteed to terminate +after a fixed number of iterations (where the number of possible +iterations is a function of the number of bits in the integers being +used). To show that the last iteration is in fact the last possible one, +it's necessary to do more than just compare the branch condition with a +constant. Instead, we can use the same proof tools that we use to +ultimately analyze the generated models to, early in the process, prove +that certain branch conditions can never be true (i.e., are +_unsatisfiable_). + +Normally, most of the Java and LLVM analysis commands simply compare +branch conditions to the constant `True` or `False` to determine whether +a branch may be feasible. However, each form of analysis allows branch +satisfiability checking to be turned on if needed, in which case +functions like `f` above will terminate. + +Next, we examine the details of the specific commands available to +analyze JVM and LLVM programs. diff --git a/doc/saw-user-manual/the-term-type.md b/doc/saw-user-manual/the-term-type.md new file mode 100644 index 0000000000..f0f2e76c18 --- /dev/null +++ b/doc/saw-user-manual/the-term-type.md @@ -0,0 +1,13 @@ +# The Term Type + +Perhaps the most important type in SAWScript, and the one most unlike +the built-in types of most other languages, is the `Term` type. +Essentially, a value of type `Term` precisely describes all +possible computations performed by some program. In particular, if +two `Term` values are *equivalent*, then the programs that they +represent will always compute the same results given the same inputs. We +will say more later about exactly what it means for two terms to be +equivalent, and how to determine whether two terms are equivalent. + +Before exploring the `Term` type more deeply, it is important to +understand the role of the Cryptol language in SAW. diff --git a/doc/saw-user-manual/transforming-term-values.md b/doc/saw-user-manual/transforming-term-values.md new file mode 100644 index 0000000000..8f153805a0 --- /dev/null +++ b/doc/saw-user-manual/transforming-term-values.md @@ -0,0 +1,315 @@ +# Transforming Term Values + +The three primary functions of SAW are _extracting_ models (`Term` +values) from programs, _transforming_ those models, and _proving_ +properties about models using external provers. So far we've shown how +to construct `Term` values from Cryptol programs; later sections +will describe how to extract them from other programs. Now we show how +to use the various term transformation features available in SAW. + +## Rewriting + +Rewriting a `Term` consists of applying one or more _rewrite rules_ to +it, resulting in a new `Term`. A rewrite rule in SAW can be specified in +multiple ways: + +- as the definition of a function that can be unfolded, +- as a term of Boolean type (or a function returning a Boolean) that + is an equality statement, and +- as a term of _equality type_ with a body that encodes a proof that + the equality in the type is valid. + +In each case the term logically consists of two sides and describes a +way to transform the left side into the right side. Each side may +contain variables (bound by enclosing lambda expressions) and is +therefore a _pattern_ which can match any term in which each variable +represents an arbitrary sub-term. The left-hand pattern describes a term +to match (which may be a sub-term of the full term being rewritten), and +the right-hand pattern describes a term to replace it with. Any variable +in the right-hand pattern must also appear in the left-hand pattern and +will be instantiated with whatever sub-term matched that variable in the +original term. + +For example, say we have the following Cryptol function: + +:::{code-block} cryptol + \(x:[8]) -> (x * 2) + 1 +::: + +We might for some reason want to replace multiplication by a power of +two with a shift. We can describe this replacement using an equality +statement in Cryptol (a rule of form 2 above): + +:::{code-block} cryptol +\(y:[8]) -> (y * 2) == (y << 1) +::: + +Interpreting this as a rewrite rule, it says that for any 8-bit vector +(call it `y` for now), we can replace `y * 2` with `y << 1`. Using this +rule to rewrite the earlier expression would then yield: + +:::{code-block} cryptol + \(x:[8]) -> (x << 1) + 1 +::: + +The general philosophy of rewriting is that the left and right patterns, +while syntactically different, should be semantically equivalent. +Therefore, applying a set of rewrite rules should not change the +fundamental meaning of the term being rewritten. SAW is particularly +focused on the task of proving that some logical statement expressed as +a `Term` is always true. If that is in fact the case, then the entire +term can be replaced by the term `True` without changing its meaning. The +rewriting process can in some cases, by repeatedly applying rules that +themselves are known to be valid, reduce a complex term entirely to +`True`, which constitutes a proof of the original statement. In other +cases, rewriting can simplify terms before sending them to external +automated provers that can then finish the job. Sometimes this +simplification can help the automated provers run more quickly, and +sometimes it can help them prove things they would otherwise be unable +to prove by applying reasoning steps (rewrite rules) that are not +available to the automated provers. + +In practical use, rewrite rules can be aggregated into `Simpset` +values in SAWScript. A few pre-defined `Simpset` values exist: + +- `empty_ss : Simpset` is the empty set of rules. Rewriting with it +should have no effect, but it is useful as an argument to some of the +functions that construct larger `Simpset` values. + +- `basic_ss : Simpset` is a collection of rules that are useful in most +proof scripts. + +- `cryptol_ss : () -> Simpset` includes a collection of Cryptol-specific +rules. Some of these simplify away the abstractions introduced in the +translation from Cryptol to SAWCore, which can be useful when proving +equivalence between Cryptol and non-Cryptol code. Leaving these +abstractions in place is appropriate when comparing only Cryptol code, +however, so `cryptol_ss` is not included in `basic_ss`. + +The next set of functions can extend or apply a `Simpset`: + +- `addsimp' : Term -> Simpset -> Simpset` adds a single `Term` to an +existing `Simpset. + +- `addsimps' : [Term] -> Simpset -> Simpset` adds a list of `Term`s to +an existing `Simpset`. + +- `rewrite : Simpset -> Term -> Term` applies a `Simpset` to an existing +`Term` to produce a new `Term`. + +To make this more concrete, we examine how the rewriting example +sketched above, to convert multiplication into shift, can work in +practice. We simplify everything with `cryptol_ss` as we go along so +that the `Term`s don't get too cluttered. First, we declare the term +to be transformed: + +:::{code-block} console +sawscript> let term = rewrite (cryptol_ss ()) {{ \(x:[8]) -> (x * 2) + 1 }} +sawscript> print_term term +\(x : Prelude.Vec 8 Prelude.Bool) -> + Prelude.bvAdd 8 (Prelude.bvMul 8 x (Prelude.bvNat 8 2)) + (Prelude.bvNat 8 1) +::: + +Next, we declare the rewrite rule: + +:::{code-block} console +sawscript> let rule = rewrite (cryptol_ss ()) {{ \(y:[8]) -> (y * 2) == (y << 1) }} +sawscript> print_term rule +let { x@1 = Prelude.Vec 8 Prelude.Bool + } + in \(y : x@1) -> + Cryptol.ecEq x@1 (Cryptol.PCmpWord 8) + (Prelude.bvMul 8 y (Prelude.bvNat 8 2)) + (Prelude.bvShiftL 8 Prelude.Bool 1 Prelude.False y + (Prelude.bvNat 1 1)) +::: + +Finally, we apply the rule to the target term: + +:::{code-block} console +sawscript> let result = rewrite (addsimp' rule empty_ss) term +sawscript> print_term result +\(x : Prelude.Vec 8 Prelude.Bool) -> + Prelude.bvAdd 8 + (Prelude.bvShiftL 8 Prelude.Bool 1 Prelude.False x + (Prelude.bvNat 1 1)) + (Prelude.bvNat 8 1) +::: + +Note that `addsimp'` and `addsimps'` take a `Term` or list of `Term`s; +these could in principle be anything, and are not necessarily terms +representing logically valid equalities. They have `'` suffixes because +they are not intended to be the primary interface to rewriting. When using +these functions, the soundness of the proof process depends on the +correctness of these rules as a side condition. + +The primary interface to rewriting uses the `Theorem` type instead of +the `Term` type, as shown in the signatures for `addsimp` and +`addsimps`. + +- `addsimp : Theorem -> Simpset -> Simpset` adds a single `Theorem` to a +`Simpset`. + +- `addsimps : [Theorem] -> Simpset -> Simpset` adds several `Theorem` +values to a `Simpset`. + +A `Theorem` is essentially a `Term` that is proven correct in some way. +In general, a `Theorem` can be any statement, and may not be useful as a +rewrite rule. However, if it has an appropriate shape it can be used for +rewriting. In the ["Proofs about Terms"](proofs-about-terms.md) section, +we'll describe how to construct `Theorem` values from `Term` values. + +In the absence of user-constructed `Theorem` values, there are some +additional built-in rules that are not included in either `basic_ss` and +`cryptol_ss` because they are not always beneficial, but that can +sometimes be helpful or essential. The `cryptol_ss` simpset includes +rewrite rules to unfold all definitions in the `Cryptol` SAWCore module, +but does not include any of the terms of equality type. + +- `add_cryptol_defs : [String] -> Simpset -> Simpset` adds unfolding +rules for functions with the given names from the SAWCore `Cryptol` module +to the given `Simpset`. + +- `add_cryptol_eqs : [String] -> Simpset -> Simpset` adds the terms of +equality type with the given names from the SAWCore `Cryptol` module to +the given `Simpset`. + +- `add_prelude_defs : [String] -> Simpset -> Simpset` adds unfolding +rules from the SAWCore `Prelude` module to a `Simpset`. + +- `add_prelude_eqs : [String] -> Simpset -> Simpset` adds equality-typed +terms from the SAWCore `Prelude` module to a `Simpset`. + +Finally, it's possible to construct a theorem from an arbitrary SAWCore +expression (rather than a Cryptol expression), using the `core_axiom` +function. + +- `core_axiom : String -> Theorem` creates a `Theorem` from a `String` +in SAWCore syntax. Any `Theorem` introduced by this function is assumed +to be correct, so use it with caution. + +## Folding and Unfolding + +A SAWCore term can be given a name using the `define` function, and is +then by default printed as that name alone. A named subterm can be +"unfolded" so that the original definition appears again. + +- `define : String -> Term -> TopLevel Term` + +- `unfold_term : [String] -> Term -> Term` + +For example: + +:::{code-block} console +sawscript> let t = {{ 0x22 }} +sawscript> print_term t +Cryptol.ecNumber (Cryptol.TCNum 34) (Prelude.Vec 8 Prelude.Bool) + (Cryptol.PLiteralSeqBool (Cryptol.TCNum 8)) +sawscript> t' <- define "t" t +sawscript> print_term t' +t +sawscript> let t'' = unfold_term ["t"] t' +sawscript> print_term t'' +Cryptol.ecNumber (Cryptol.TCNum 34) (Prelude.Vec 8 Prelude.Bool) + (Cryptol.PLiteralSeqBool (Cryptol.TCNum 8)) +::: + +This process of folding and unfolding is useful both to make large terms +easier for humans to work with and to make automated proofs more +tractable. We'll describe the latter in more detail when we discuss +interacting with external provers. + +In some cases, folding happens automatically when constructing Cryptol +expressions. Consider the following example: + +:::{code-block} console +sawscript> let t = {{ 0x22 }} +sawscript> print_term t +Cryptol.ecNumber (Cryptol.TCNum 34) (Prelude.Vec 8 Prelude.Bool) + (Cryptol.PLiteralSeqBool (Cryptol.TCNum 8)) +sawscript> let {{ t' = 0x22 }} +sawscript> print_term {{ t' }} +t' +::: + +This illustrates that a bare expression in Cryptol braces gets +translated directly to a SAWCore term. However, a Cryptol _definition_ +gets translated into a _folded_ SAWCore term. In addition, because the +second definition of `t` occurs at the Cryptol level, rather than the +SAWScript level, it is visible only inside Cryptol braces. Definitions +imported from Cryptol source files are also initially folded and can be +unfolded as needed. + +## Other Built-in Transformation and Inspection Functions + +In addition to the `Term` transformation functions described so far, a +variety of others also exist. + +- `beta_reduce_term : Term -> Term` takes any sub-expression of the form +`(\x -> t) v` in the given `Term` and replaces it with a transformed +version of `t` in which all instances of `x` are replaced by `v`. + +- `replace : Term -> Term -> Term -> TopLevel Term` replaces arbitrary +subterms. A call to `replace x y t` replaces any instance of `x` inside +`t` with `y`. + +Assessing the size of a term can be particularly useful during benchmarking. +SAWScript provides two mechanisms for this. + +- `term_size : Term -> Int` calculates the number of nodes in the +Directed Acyclic Graph (DAG) representation of a `Term` used internally +by SAW. This is the most appropriate way of determining the resource use +of a particular term. + +- `term_tree_size : Term -> Int` calculates how large a `Term` would be +if it were represented by a tree instead of a DAG. This can, in general, +be much, much larger than the number returned by `term_size`, and serves +primarily as a way of assessing, for a specific term, how much benefit +there is to the term sharing used by the DAG representation. + +Finally, there are a few commands related to the internal SAWCore type of a +`Term`. + +- `check_term : Term -> TopLevel ()` checks that the internal structure +of a `Term` is well-formed and that it passes all of the rules of the +SAWCore type checker. + +- `type : Term -> Type` returns the type of a particular `Term`, which +can then be used to, for example, construct a new fresh variable with +`fresh_symbolic`. + +(loading-and-storing-terms)= +## Loading and Storing Terms + +Most frequently, `Term` values in SAWScript come from Cryptol, JVM, or +LLVM programs, or some transformation thereof. However, it is also +possible to obtain them from various other sources. + +- `parse_core : String -> Term` parses a `String` containing a term in +SAWCore syntax, returning a `Term`. + +- `read_core : String -> TopLevel Term` is like `parse_core`, but +obtains the text from the given file and expects it to be in the simpler +SAWCore external representation format, rather than the human-readable +syntax shown so far. + +- `read_aig : String -> TopLevel Term` returns a `Term` representation +of an And-Inverter-Graph (AIG) file in AIGER format. + +- `read_bytes : String -> TopLevel Term` reads a constant sequence of +bytes from a file and represents it as a `Term`. Its result will always +have Cryptol type `[n][8]` for some `n`. + +It is also possible to write `Term` values into files in various +formats, including: AIGER (`write_aig`), CNF (`write_cnf`), SAWCore +external representation (`write_core`), and SMT-Lib version 2 +(`write_smtlib2`). + +- `write_aig : String -> Term -> TopLevel ()` + +- `write_cnf : String -> Term -> TopLevel ()` + +- `write_core : String -> Term -> TopLevel ()` + +- `write_smtlib2 : String -> Term -> TopLevel ()` diff --git a/doc/scripts/requirements.txt b/doc/scripts/requirements.txt new file mode 100644 index 0000000000..1845bd3689 --- /dev/null +++ b/doc/scripts/requirements.txt @@ -0,0 +1,35 @@ +alabaster==1.0.0 +babel==2.16.0 +certifi==2024.12.14 +charset-normalizer==3.4.1 +docutils==0.21.2 +idna==3.10 +imagesize==1.4.1 +Jinja2==3.1.5 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +mdit-py-plugins==0.4.2 +mdurl==0.1.2 +myst-parser==4.0.0 +packaging==24.2 +pip==24.3.1 +Pygments==2.19.1 +PyYAML==6.0.2 +requests==2.32.3 +snowballstemmer==2.2.0 +Sphinx==8.1.3 +sphinx-copybutton==0.5.2 +sphinx-hoverxref==1.4.2 +sphinx-notfound-page==1.0.4 +sphinx-rtd-theme==3.0.2 +sphinxcontrib-applehelp==2.0.0 +sphinxcontrib-devhelp==2.0.0 +sphinxcontrib-htmlhelp==2.1.0 +sphinxcontrib-jquery==4.1 +sphinxcontrib-jsmath==1.0.1 +sphinxcontrib-qthelp==2.0.0 +sphinxcontrib-serializinghtml==2.0.0 +urllib3==2.3.0 + +-e ./saw-lexer +-e ./sphinx-download-dir diff --git a/doc/scripts/setup_env.sh b/doc/scripts/setup_env.sh new file mode 100755 index 0000000000..89bfa94808 --- /dev/null +++ b/doc/scripts/setup_env.sh @@ -0,0 +1,49 @@ +#!/bin/sh + +# Set up a new or existing .venv directory as a Python virtual environment +# suitable for building SAW documentation, particularly with make_docs.sh. + +REQUIREMENTS="$(dirname $0)/requirements.txt" + +echo "Checking for python3..." +if ! [ -x "$(command -v python3)" ]; +then + echo "Error: python3 is not installed." >&2 + return 1 +fi + +if [ -d ".venv" ]; +then + echo "Found a '.venv' directory." + + if ! [ -e .venv/bin/activate ] + then + echo -e "\033[1;33mWARNING:\033[0m '.venv/bin/activate' not found." + echo "Running python3 -m venv .venv to repair..." + python3 -m venv .venv + fi + + # We want to check _the virtual environment's_ pip list, so activate here. + . .venv/bin/activate + + if pip freeze -r "${REQUIREMENTS}" 2>&1 | grep -q "not installed" + then + echo -e "\033[1;33mWARNING:\033[0m 'requirements.txt' not satisfied." + echo "Installing dependencies..." + pip install -qr "${REQUIREMENTS}" + fi + + echo -e "\033[1;32mEnvironment updated successfully!\033[0m" + exit +fi + +echo "No '.venv' directory found." +echo "Running python3 -m venv .venv..." +python3 -m venv .venv + +. .venv/bin/activate + +echo "Installing dependencies..." +pip install -qr "${REQUIREMENTS}" + +echo -e "\033[1;32mEnvironment created successfully!\033[0m" diff --git a/doc/sphinx-download-dir/pyproject.toml b/doc/sphinx-download-dir/pyproject.toml new file mode 100644 index 0000000000..5f3591ad76 --- /dev/null +++ b/doc/sphinx-download-dir/pyproject.toml @@ -0,0 +1,10 @@ +[build-system] +requires = [ + "setuptools", +] +build-backend = "setuptools.build_meta" + +[project] +name = "sphinx-download-dir" +version = "0.0.1" +dependencies = ["sphinx"] diff --git a/doc/sphinx-download-dir/sphinx_download_dir.py b/doc/sphinx-download-dir/sphinx_download_dir.py new file mode 100644 index 0000000000..9cd55b9588 --- /dev/null +++ b/doc/sphinx-download-dir/sphinx_download_dir.py @@ -0,0 +1,55 @@ +from pathlib import Path +import tarfile + +from docutils import nodes + +from sphinx.addnodes import download_reference +from sphinx.application import Sphinx + +from sphinx.util.docutils import ReferenceRole +from sphinx.util.typing import ExtensionMetadata + + +class DownloadDirRole(ReferenceRole): + """Role providing directory downloads. + + Injects a suitable download reference after making a tar.gz archive of the + directory. + """ + + def run(self) -> tuple[list[nodes.Node], list[nodes.system_message]]: + dir_to_download = Path(self.env.relfn2path(self.target)[1]) + + if not dir_to_download.is_dir(): + msg = self.inliner.reporter.error( + f"download-dir expects a source directory path, but {self.target} does not resolve to a directory -- Skipping." + ) + return [], [msg] + + out_tar = dir_to_download.with_suffix(".tar.gz") + self.env.note_dependency(str(out_tar)) + + with tarfile.open(out_tar, "w:gz") as ot: + for child in dir_to_download.iterdir(): + ot.add(child) + + download_node = download_reference( + reftarget=f"/{out_tar.relative_to(self.env.srcdir)}", + ) + download_node.append( + nodes.literal( + text=self.title if self.has_explicit_title else out_tar.name, + classes=["download"], + ) + ) + + return [download_node], [] + + +def setup(app: Sphinx) -> ExtensionMetadata: + app.add_role("download-dir", DownloadDirRole()) + + return { + "version": "0.0.1", + "parallel_read_safe": True, + } diff --git a/doc/tutorial/.gitignore b/doc/tutorial/.gitignore deleted file mode 100644 index a9a5aecf42..0000000000 --- a/doc/tutorial/.gitignore +++ /dev/null @@ -1 +0,0 @@ -tmp diff --git a/doc/tutorial/Makefile b/doc/tutorial/Makefile deleted file mode 100644 index dd3fc22139..0000000000 --- a/doc/tutorial/Makefile +++ /dev/null @@ -1,69 +0,0 @@ -TARGET = tmp/sawScriptTutorial -SRCS = ${TARGET}.md ${wildcard *.bib} ${wildcard *.cls} ${wildcard *.sty} -CODE = ${wildcard code/*.c} \ - ${wildcard code/*.cry} \ - ${wildcard code/*.java} \ - ${wildcard code/*.saw} -TARBALL = tmp/saw-tutorial-code.tar.gz -SPELLSRC = ${TARGET}.tex -NEWSPELL = ${TARGET}.SPELLNEW -OLDSPELL = ${TARGET}.SPELLOLD -SPELL = aspell -t -l -AUX = ${wildcard *.blg} ${wildcard *.bbl} ${wildcard *.aux} \ - ${wildcard *.eps} ${wildcard *.log} ${wildcard *.toc} \ - ${TARGET}.md -PDFARGS = -H tutorial-head.tex \ - -B tutorial-before.tex \ - -A tutorial-after.tex \ - --toc \ - --listings \ - -V documentclass:galois-whitepaper \ - -V fontsize:12 \ - --pdf-engine=xelatex -HTMLARGS = --css doc.css \ - -B tutorial-before.html \ - --toc \ - --standalone \ - --metadata title="SAWScript Tutorial" \ - --self-contained - -all: ${TARGET}.pdf ${TARGET}.html ${TARBALL} - -${TARGET}.pdf: ${SRCS} Makefile | tmp - pandoc ${PDFARGS} -o $@ ${TARGET}.md - -${TARGET}.html: ${SRCS} Makefile | tmp - pandoc ${HTMLARGS} -o $@ ${TARGET}.md - -${TARBALL}: ${CODE} - tar czf ${TARBALL} code - -# Pre-processing step. Right now, does nothing. -${TARGET}.md: tutorial.md docode.hs ${CODE} | tmp - runhaskell docode.hs < $< > $@ - -docode: ${TARGET}.md | tmp - -.PHONY: spellClean superClean clean quickSpell - -#run: -# sawScript ${TARGET}.tex -c ../../Examples/ECC/build -j ../../jdk1.6/classes.jar - -tmp: - mkdir -p tmp - -clean: - -rm -f ${AUX} - -superClean: clean - -rm -f ${TARGET}.pdf ${TARGET}.html ${TARGET}.md tmp - -spellClean: - rm -f ${NEWSPELL} ${OLDSPELL} - -quickSpell: - @touch ${NEWSPELL} - @mv -f ${NEWSPELL} ${OLDSPELL} - @cat ${SPELLSRC} | ${SPELL} | tr "A-Z" "a-z" | sort | uniq | less > ${NEWSPELL} - @echo '(The ones marked with < are new.)' - @diff ${NEWSPELL} ${OLDSPELL} diff --git a/doc/tutorial/README.md b/doc/tutorial/README.md deleted file mode 100644 index a6f348e7fe..0000000000 --- a/doc/tutorial/README.md +++ /dev/null @@ -1,4 +0,0 @@ -To update the tutorial, edit `tutorial.md` and then run `make`. The -output is generated in `./tmp`. If you want to version a new copy of -the tutorial, copy `./tmp/sawScriptTutorial.pdf` to -`./sawScriptTutorial.pdf` and commit. diff --git a/doc/tutorial/doc.css b/doc/tutorial/doc.css deleted file mode 100644 index 08327aa2df..0000000000 --- a/doc/tutorial/doc.css +++ /dev/null @@ -1,93 +0,0 @@ -body { - font-size: 14px; - line-height: 21px; - color: #333; - font-family: "Helvetica Neue", "Lucida Grande", "Lucida Sans Unicode", Helvetica, Arial, sans-serif !important; -} -.container { - width: 950px; - margin: 0; - padding: 80px 0px 50px 50px; -} -p, li { - width: 625px; -} -a { - color: #191933; -} -h1, h2, h3, h4, h5, h6, b.header { - color: #000; - margin-top: 40px; - margin-bottom: 15px; - text-shadow: #fff 0 1px 1px; -} -h1 { font-size: 18px } -h2 { font-size: 16px } -h3, h4, h5, h6 { font-size: 14px } -.author { font-weight: normal } - -br.clear { - height: 0; - clear: both; -} -ul { - padding-left: 20px; -} -b.header { - display: block; -} -li { - margin-bottom: 10px; -} -code, pre, tt, textarea { - font-family: Monaco, Consolas, "Lucida Console", monospace; - font-size: 12px; - line-height: 18px; - color: #155; - white-space: pre-wrap; - word-wrap: break-word; -} - tt { - display: inline-block; - background: #fff; - border: 1px solid #dedede; - padding: 0px 0.2em; - } - pre { - border-left: 5px solid rgba(0,0,0,0.2); - padding: 3px 0 3px 12px; - font-size: 12px; - } - pre.no_bar { - border-left: 0; - margin-left: 0; - padding-left: 0; - } -div.code { - position: relative; - background: #fff; - border: 1px solid #d8d8d8; - -webkit-box-shadow: 0px 0px 4px rgba(0,0,0,0.23); - -moz-box-shadow: 0px 0px 4px rgba(0,0,0,0.23); - box-shadow: 0px 0px 4px rgba(0,0,0,0.23); - zoom: 1; -} - div.code { - text-transform: none; - position: absolute; - right: 8px; bottom: 8px; - } - div.code { - left: 8px; right: auto; - } - div.code pre, div.code textarea { - float: left; - width: 450px; - background: #fff; - border-left: 1px dotted #d0d0d0; - margin: 10px 0 15px 3px; - padding: 0 0 0 12px; - } - div.code pre:first-child { - border-left: 0; - } diff --git a/doc/tutorial/docode.hs b/doc/tutorial/docode.hs deleted file mode 100644 index 550c9fec15..0000000000 --- a/doc/tutorial/docode.hs +++ /dev/null @@ -1,30 +0,0 @@ -module Main where - -import Control.Monad -import System.Process - -interactIO :: (String -> IO String) -> IO () -interactIO act = do - txt <- getContents - mapM_ (act >=> putStr) (lines txt) - --- TODO: this does no error handling whatsoever -processCommand :: String -> IO String -processCommand line = - case words line of - ("$cmd":cmd:args) -> do - result <- readProcess cmd args "" - return $ unlines [ unwords ("#" : cmd : args) ] ++ result - ["$include", "all", file] -> readFile file - ["$include", range, file] -> - do txt <- readFile file - return . unlines . take (endN - startN) $ drop startN (lines txt) - where - (start, end') = break (== '-') range - end = tail end' - startN = read start - 1 - endN = read end - _ -> return (line ++ "\n") - -main :: IO () -main = interactIO processCommand diff --git a/doc/tutorial/figures/galois.pdf b/doc/tutorial/figures/galois.pdf deleted file mode 100644 index 58cbef37a0..0000000000 Binary files a/doc/tutorial/figures/galois.pdf and /dev/null differ diff --git a/doc/tutorial/figures/galois.png b/doc/tutorial/figures/galois.png deleted file mode 100644 index 39a5ba86cb..0000000000 Binary files a/doc/tutorial/figures/galois.png and /dev/null differ diff --git a/doc/tutorial/galois-whitepaper.cls b/doc/tutorial/galois-whitepaper.cls deleted file mode 100644 index a4b29bc2e7..0000000000 --- a/doc/tutorial/galois-whitepaper.cls +++ /dev/null @@ -1,143 +0,0 @@ -% This class file defines a style useful for writing whitepapers at Galois. -\ProvidesClass{galois-whitepaper}[2010/11/02 Galois whitepaper class] -\newif\if@xelatex -\@xelatexfalse -\DeclareOption{xelatex}{\@xelatextrue} -\DeclareOption*{% -\PassOptionsToClass{\CurrentOption}{article}% -} -\ProcessOptions\relax -\LoadClass{article} - -% Useful packages: -% \RequirePackage[table]{xcolor} -\RequirePackage{graphicx} -%\RequirePackage[headheight=144pt,top=1in,left=1in,right=1in,bottom=1in]{geometry} -\RequirePackage[top=1in,left=1in,right=1in,bottom=1in]{geometry} -\RequirePackage{lastpage} - -% Define fonts -\if@xelatex -\RequirePackage{fontspec} -\RequirePackage{xunicode} -\RequirePackage{xltxtra} -\setmainfont[Mapping=tex-text]{Palatino} -\setsansfont[Mapping=tex-text]{Arial} -\else -\RequirePackage[sc]{mathpazo} -\linespread{1.05} -\RequirePackage[scaled]{helvet} -\fi - -\RequirePackage{sectsty} -\allsectionsfont{\sffamily} - -\newcommand{\highlight}[2][yellow]{\colorbox{#1}{#2}} - -% Misc defines -\makeatletter - \newcommand{\@companyName}{Galois, Inc.} - \newcommand{\@addressPhone}{ - 421 SW 6th Ave., Ste.\ 300\\ - Portland, OR 97204 - } - \newcommand{\@contact}{\todo[Contact]} - \newcommand{\@subtitle}{TODO: Subtitle} - \newcommand{\contact}[1]{\renewcommand{\@contact}{{#1}}} -\makeatother - -\renewcommand{\paragraph}[1]{\textbf{\sffamily{#1}}\quad} - -% \definecolor{galoislightgray}{gray}{0.6} -% Define header and footer -\RequirePackage{fancyhdr} -\pagestyle{fancy} -\fancyheadoffset[l]{1in} -\fancyheadoffset[r]{0in} -%\fancyheadoffset[r]{0.2in} -\fancyhead{}%clear -\fancyfoot{}%clear -\renewcommand{\headrulewidth}{0pt} -\renewcommand{\footrulewidth}{1pt} - - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% -%% Define a special header style for the title page: -% - \makeatletter - \newcommand{\ps@title}{% - \renewcommand{\@oddhead}{% - \sffamily\includegraphics[width=1in]{figures/galois.pdf} - }} -\makeatother - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% -%% Set up the headers/footers for the rest of the document: -%% -\lhead{\sffamily\hspace{1in}\includegraphics[width=1in]{figures/galois.pdf}} - -\makeatletter -\rhead{\sffamily \textit{\@title{}}} -\makeatother - -\lfoot{\sffamily\color{lightgray}\small\copyright{} 2019 Galois, Inc.} - -\rfoot{ - % \sffamily \color{galoislightgray}\small - \sffamily \color{lightgray}\small - Page \thepage{}% of \pageref{LastPage} -} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% -%% The title content definition: -%% -\makeatletter -\DeclareRobustCommand{\maketitle}{ - \thispagestyle{title} - \begin{center} - {\sffamily - {\large \textbf{}}\\ - \vspace{0.3in} - \textbf{\sffamily \LARGE \@title}\\ - %\vspace{0.5in} - %{ \@author }\\ - \vspace{0.3in} - {\large \@companyName }\\ - \vspace{0.025in} - {\small \@addressPhone }\\ - } - \end{center} -} - -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -%% -%% Add a stop ``.'' after each section number: -%% - -% Redefine the bibliography environment to use \section{\bibname} -% instead of \section*{\bibname} -\newcommand{\bibname}{References} -\renewenvironment{thebibliography}[1] - {\section*{\bibname}% <-- this line was changed from \chapter* to \section* - \@mkboth{\MakeUppercase\bibname}{\MakeUppercase\bibname}% - \list{\@biblabel{\@arabic\c@enumiv}}% - {\settowidth\labelwidth{\@biblabel{#1}}% - \leftmargin\labelwidth - \advance\leftmargin\labelsep - \@openbib@code - \usecounter{enumiv}% - \let\p@enumiv\@empty - \renewcommand\theenumiv{\@arabic\c@enumiv}}% - \sloppy - \clubpenalty4000 - \@clubpenalty \clubpenalty - \widowpenalty4000% - \sfcode`\.\@m} - {\def\@noitemerr - {\@latex@warning{Empty `thebibliography' environment}}% - \endlist} - -\makeatother diff --git a/doc/tutorial/sawScriptTutorial.pdf b/doc/tutorial/sawScriptTutorial.pdf deleted file mode 100644 index 263092dca4..0000000000 Binary files a/doc/tutorial/sawScriptTutorial.pdf and /dev/null differ diff --git a/doc/tutorial/tutorial-after.tex b/doc/tutorial/tutorial-after.tex deleted file mode 100644 index cc9383c255..0000000000 --- a/doc/tutorial/tutorial-after.tex +++ /dev/null @@ -1,4 +0,0 @@ -\newpage -\bibliography{sawScriptTutorial} -\addcontentsline{toc}{section}{References} -\bibliographystyle{abbrv} diff --git a/doc/tutorial/tutorial-before.html b/doc/tutorial/tutorial-before.html deleted file mode 100644 index 2ce0a49c74..0000000000 --- a/doc/tutorial/tutorial-before.html +++ /dev/null @@ -1 +0,0 @@ - diff --git a/doc/tutorial/tutorial-before.tex b/doc/tutorial/tutorial-before.tex deleted file mode 100644 index 2a8b21e849..0000000000 --- a/doc/tutorial/tutorial-before.tex +++ /dev/null @@ -1 +0,0 @@ -\maketitle diff --git a/doc/tutorial/tutorial-head.tex b/doc/tutorial/tutorial-head.tex deleted file mode 100644 index 6da511e711..0000000000 --- a/doc/tutorial/tutorial-head.tex +++ /dev/null @@ -1,60 +0,0 @@ -\usepackage{listings} -\usepackage{float} -\usepackage{xspace} -\usepackage{color} -\usepackage{tikz} -\usepackage{url} -\usepackage{amsmath} -\usepackage{amscd} -\usepackage{verbatim} -\usepackage{fancyvrb} - -\let\verbatiminput=\verbatimtabinput -\VerbatimFootnotes -\DefineVerbatimEnvironment{code}{Verbatim}{} -\DefineVerbatimEnvironment{pseudoCode}{Verbatim}{} -\hyphenation{SAW-Script} -\newcommand{\sawScript}{{\sc SAWScript}\xspace} -\renewcommand{\textfraction}{0.05} -\renewcommand{\topfraction}{0.95} -\renewcommand{\bottomfraction}{0.95} -\renewcommand{\floatpagefraction}{0.35} -\setcounter{totalnumber}{5} -\definecolor{MyGray}{rgb}{0.9,0.9,0.9} -\makeatletter\newenvironment{graybox}{% - \begin{lrbox}{\@tempboxa}\begin{minipage}{\columnwidth}}{\end{minipage}\end{lrbox}% - \colorbox{MyGray}{\usebox{\@tempboxa}} -}\makeatother - -\setlength{\parskip}{0.6em} -\setlength{\abovecaptionskip}{0.5em} - -\lstset{ - basicstyle=\footnotesize\ttfamily, % Standardschrift - %numbers=left, % Ort der Zeilennummern - numberstyle=\tiny, % Stil der Zeilennummern - %stepnumber=2, % Abstand zwischen den Zeilennummern - numbersep=5pt, % Abstand der Nummern zum Text - tabsize=2, % Groesse von Tabs - extendedchars=true, % - breaklines=true, % Zeilen werden Umgebrochen - keywordstyle=\color{red}, - frame=lrtb, % left, right, top, bottom frames. - % keywordstyle=[1]\textbf, % Stil der Keywords - % keywordstyle=[2]\textbf, % - % keywordstyle=[3]\textbf, % - % keywordstyle=[4]\textbf, \sqrt{\sqrt{}} % - stringstyle=\color{white}\ttfamily, % Farbe der String - showspaces=false, % Leerzeichen anzeigen ? - showtabs=false, % Tabs anzeigen ? - xleftmargin=10pt, % was 17 - xrightmargin=5pt, - framexleftmargin=5pt, % was 17 - framexrightmargin=-1pt, % was 5pt - framexbottommargin=4pt, - %backgroundcolor=\color{lightgray}, - showstringspaces=false % Leerzeichen in Strings anzeigen ? -} - -\author{The Galois SAW Team\\\texttt{saw@galois.com}} -\title{SAWScript Tutorial} diff --git a/doc/tutorial/tutorial.md b/doc/tutorial/tutorial.md deleted file mode 100644 index 80ab805ae9..0000000000 --- a/doc/tutorial/tutorial.md +++ /dev/null @@ -1,519 +0,0 @@ -Introduction -============ - -SAWScript is a special-purpose programming language developed by -Galois to help orchestrate and track the results of the large -collection of proof tools necessary for analysis and verification of -complex software artifacts. - -The language adopts the functional paradigm, and largely follows the -structure of many other typed functional languages, with some special -features specifically targeted at the coordination of verification and -analysis tasks. - -This tutorial introduces the details of the language by walking through -several examples, and showing how simple verification tasks can be -described. The complete examples are available [on -GitHub](https://github.com/GaloisInc/saw-script/tree/master/doc/tutorial/code). -Most of the examples make use of inline specifications written in -Cryptol, a language originally designed for high-level descriptions of -cryptographic algorithms. For readers unfamiliar with Cryptol, various -documents describing its use are available -[here](http://cryptol.net/documentation.html). - -Example: Find First Set -======================= - -As a first example, we consider showing the equivalence of several quite -different implementations of the POSIX `ffs` function, which identifies -the position of the first ``1`` bit in a word. The function takes an -integer as input, treated as a vector of bits, and returns another -integer which indicates the index of the first bit set (zero being the -least significant). This function can be implemented in several ways -with different performance and code clarity tradeoffs, and we would like -to show those different implementations are equivalent. - -Reference Implementation -------------------------- - -One simple implementation takes the form of a loop with an index -initialized to zero, and a mask initialized to have the least -significant bit set. On each iteration, we increment the index, and -shift the mask to the left. Then we can use a bitwise "and" operation to -test the bit at the index indicated by the index variable. The following -C code (which is also in the `ffs.c` file on GitHub) uses this approach. - -``` {.c} -$include 9-17 code/ffs.c -``` - -This implementation is relatively straightforward, and a proficient C -programmer would probably have little difficulty believing its -correctness. However, the number of branches taken during execution -could be as many as 32, depending on the input value. It's possible to -implement the same algorithm with significantly fewer branches, and no -backward branches. - -Alternative Implementations ---------------------------- - -An alternative implementation, taken by the following function (also in -`ffs.c`), treats the bits of the input word in chunks, allowing -sequences of zero bits to be skipped over more quickly. - -``` {.c} -$include 19-26 code/ffs.c -``` - -Another optimized version, in the following rather mysterious program -(also in `ffs.c`), based on the `ffs` implementation in [musl -libc](http://musl.libc.org/). - -``` {.c} -$include 69-76 code/ffs.c -``` - -These optimized versions are much less obvious than the reference -implementation. They might be faster, but how do we gain confidence -that they calculate the same results as the reference implementation? - -Finally, consider a buggy implementation which is correct on all but one -possible input (also in `ffs.c`). Although contrived, this program -represents a case where traditional testing -- as opposed to -verification -- is unlikely to be helpful. - -``` {.c} -$include 43-47 code/ffs.c -``` - -SAWScript allows us to state these problems concisely, and to quickly -and automatically 1) prove the equivalence of the reference and -optimized implementations on all possible inputs, and 2) find an input -exhibiting the bug in the third version. - -Generating LLVM Code --------------------- - -SAW can analyze LLVM code, but most programs are originally written in a -higher-level language such as C, as in our example. Therefore, the C -code must be translated to LLVM, using something like the following -command: - - > clang -g -c -emit-llvm -o ffs.bc ffs.c - -The `-g` flag instructs `clang` to include debugging information, which -is useful in SAW to refer to variables and struct fields using the same -names as in C. We have tested SAW successfully with versions of `clang` -from 3.6 to 7.0. Please report it as a bug [on -GitHub](https://github.com/GaloisInc/saw-script/issues) if SAW fails to -parse any LLVM bitcode file. - -This command, and following command examples in this tutorial, can be -run from the `code` directory [on -GitHub](https://github.com/GaloisInc/saw-script/tree/master/doc/tutorial/code). -A `Makefile` also exists in that directory, providing quick shortcuts -for tasks like this. For instance, we can get the same effect as the -previous command by running: - - > make ffs.bc - -\newpage - -Equivalence Proof ------------------ - -We now show how to use SAWScript to prove the equivalence of the -reference and implementation versions of the FFS algorithm, and -exhibit the bug in the buggy implementation. - -A SAWScript program is typically structured as a sequence of commands, -potentially along with definitions of functions that abstract over -commonly-used combinations of commands. - -The following script (in `ffs_llvm.saw`) is sufficient to automatically -prove the equivalence of `ffs_ref` with `ffs_imp` and `ffs_musl`, and -identify the bug in `ffs_bug`. - -``` -$include all code/ffs_llvm.saw -``` - -In this script, the `print` commands simply display text for the user. -The `llvm_extract` command instructs the SAWScript interpreter -to perform symbolic simulation of the given C function (e.g., `ffs_ref`) -from a given bitcode file (e.g., `ffs.bc`), and return a term -representing the semantics of the function. - -The `let` statement then constructs a new term corresponding to the -assertion of equality between two existing terms. Arbitrary -Cryptol expressions can be embedded within SAWScript; to distinguish -Cryptol code from SAWScript commands, the Cryptol code is placed -within double brackets `{{` and `}}`. - -The `prove` command can verify the validity of such an assertion, or -produce a counter-example that invalidates it. The `abc` parameter -indicates what theorem prover to use; SAWScript offers support for many -other SAT and SMT solvers as well as user definable simplification -tactics. - -Similarly, the `sat` command works in the opposite direction to `prove`. -It attempts to find a value for which the given assertion is true, and -fails if there is no such value. - -If the `saw` executable is in your PATH, you can run the script above with - - > saw ffs_llvm.saw - -producing the output - -``` -Loading file "ffs_llvm.saw" -Extracting reference term: ffs_ref -Extracting implementation term: ffs_imp -Extracting implementation term: ffs_musl -Extracting buggy term: ffs_bug -Proving equivalence: ffs_ref == ffs_imp -Valid -Proving equivalence: ffs_ref == ffs_musl -Valid -Finding bug via sat search: ffs_ref != ffs_bug -Sat: [x = 0x101010] -Finding bug via failed proof: ffs_ref == ffs_bug -prove: 1 unsolved subgoal(s) -Invalid: [x = 0x101010] -Done. -``` - -Note that both explicitly searching for an input exhibiting the bug -(with `sat`) and attempting to prove the false equivalence (with -`prove`) exhibit the bug. Symmetrically, we could use `sat` to prove the -equivalence of `ffs_ref` and `ffs_imp`, by checking that the -corresponding disequality is unsatisfiable. Indeed, this exactly what -happens behind the scenes: `prove abc ` is essentially `not (sat -abc (not ))`. - -Cross-Language Proofs ---------------------- - -We can implement the FFS algorithm in Java with code almost identical -to the C version. - -The reference version (in `FFS.java`) uses a loop, like the C version: - -``` {.java} -$include 2-10 code/FFS.java -``` - -And the efficient implementation uses a fixed sequence of masking and -shifting operations: - -``` {.java} -$include 12-19 code/FFS.java -``` - -Although in this case we can look at the C and Java code and see that -they perform almost identical operations, the low-level operators -available in C and Java do differ somewhat. Therefore, it would be -nice to be able to gain confidence that they do, indeed, perform the -same operation. - -We can do this with a process very similar to that used to compare the -reference and implementation versions of the algorithm in a single -language. - -First, we compile the Java code to a JVM class file. - - > javac -g FFS.java - -Like with `clang`, the `-g` flag instructs `javac` to include debugging -information, which can be useful to preserve variable names. - -Using `saw` with Java code requires a command-line option `-b` that -locates Java. Run the code in this section with the command: - - > saw -b ffs_compare.saw - -Alternatively, if Java is located on your `PATH`, you can omit the `-b` -option entirely. - -Both Oracle JDK and OpenJDK versions 6 through 8 work well with SAW. -SAW also includes experimental support for JDK 9 and later. Code that only -involves primitive data types (such as `FFS.java` above) is known to work well -under JDK 9+, but there are some as-of-yet unresolved issues in verifying code -involving classes such as `String`. For more information on these issues, refer -to [this GitHub issue](https://github.com/GaloisInc/crucible/issues/641). - -Now we can do the proof both within and across languages (from -`ffs_compare.saw`): - -``` -$include all code/ffs_compare.saw -``` - -Here, the `jvm_extract` function works like `llvm_extract`, but on a -Java class and method name. The `prove_print` command works similarly -to the `prove` followed by `print` combination used for the LLVM -example above. - -Using SMT-Lib Solvers -===================== - -The examples presented so far have used the internal proof system -provided by SAWScript, based primarily on a version of the ABC tool -from UC Berkeley linked into the `saw` executable. However, there is -internal support for other proof tools -- such as Yices and Z3 as -illustrated in the next example -- and more general support for -exporting models representing theorems as goals in the SMT-Lib -language. These exported goals can then be solved using an external -SMT solver. - -Consider the following C file: - -``` {.c} -$include all code/double.c -``` - -In this trivial example, an integer can be doubled either using -multiplication or shifting. The following SAWScript program (in -`double.saw`) verifies that the two are equivalent using both internal -Yices and Z3 modes, and by exporting an SMT-Lib theorem to be -checked later, by an external SAT solver. - -``` -$include all code/double.saw -``` - -The new primitives introduced here are the tilde operator, `~`, which -constructs the logical negation of a term, and `write_smtlib2`, which -writes a term as a proof obligation in SMT-Lib version 2 format. Because -SMT solvers are satisfiability solvers, their default behavior is to -treat free variables as existentially quantified. By negating the input -term, we can instead treat the free variables as universally quantified: -a result of "unsatisfiable" from the solver indicates that the original -term (before negation) is a valid theorem. The `prove` primitive does -this automatically, but for flexibility the `write_smtlib2` primitive -passes the given term through unchanged, because it might be used for -either satisfiability or validity checking. - -The SMT-Lib export capabilities in SAWScript make use of the Haskell SBV -package, and support ABC, Bitwuzla, Boolector, CVC4, CVC5, MathSAT, Yices, and -Z3. - -\newpage - -External SAT Solvers -==================== - -In addition to the `abc`, `z3`, and `yices` proof tactics used -above, SAWScript can also invoke arbitrary external SAT solvers -that read CNF files and produce results according to the SAT -competition -[input and output conventions](https://jix.github.io/varisat/manual/0.2.0/formats/dimacs.html), -using the `external_cnf_solver` tactic. For example, you can use -[PicoSAT](http://fmv.jku.at/picosat/) to prove the theorem `thm` from -the last example, with the following commands: - - let picosat = external_cnf_solver "picosat" ["%f"]; - prove_print picosat thm; - -The use of `let` is simply a convenient abbreviation. The following -would be equivalent: - - prove_print (external_cnf_solver "picosat" ["%f"]) thm; - -The first argument to `external_cnf_solver` is the name of the -executable. It can be a fully-qualified name, or simply the bare -executable name if it's in your PATH. The second argument is an array -of command-line arguments to the solver. Any occurrence of `%f` is -replaced with the name of the temporary file containing the CNF -representation of the term you're proving. - -The `external_cnf_solver` tactic is based on the same underlying -infrastructure as the `abc` tactic, and is generally capable of -proving the same variety of theorems. - -To write a CNF file without immediately invoking a solver, use the -`offline_cnf` tactic, or the `write_cnf` top-level command. - -Compositional Proofs -==================== - -The examples shown so far treat programs as monolithic entities. A -Java method or C function, along with all of its callees, is -translated into a single mathematical model. SAWScript also has -support for more compositional proofs, as well as proofs about -functions that use heap data structures. - -Compositional Imperative Proofs -------------------------------- - -As a simple example of compositional reasoning on imperative programs, -consider the following Java code. - -``` {.java} -$include all code/Add.java -``` - -Here, the `add` function computes the sum of its arguments. The `dbl` -function then calls `add` to double its argument. While it would be easy -to prove that `dbl` doubles its argument by following the call to `add`, -it's also possible in SAWScript to prove something about `add` first, -and then use the results of that proof in the proof of `dbl`, as in the -following SAWScript code (`java_add.saw` on GitHub). - -\newpage - -```` -$include all code/java_add.saw -```` - -This can be run as follows: - - > saw -b java_add.saw - -In this example, the definitions of `add_spec` and `dbl_spec` provide -extra information about how to configure the symbolic simulator when -analyzing Java code. In this case, the setup blocks provide explicit -descriptions of the implicit configuration used by -`jvm_extract` (used in the earlier Java FFS example and in the -next section). The `jvm_fresh_var` commands instruct the simulator to -create fresh symbolic inputs to correspond to the Java variables `x` and -`y`. Then, the `jvm_return` commands indicate the expected return value -of the each method, in terms of existing models (which can be written -inline). Because Java methods can operate on references, as well, which -do not exist in Cryptol, Cryptol expressions must be translated to JVM -values with `jvm_term`. - -To make use of these setup blocks, the `jvm_verify` command analyzes -the method corresponding to the class and method name provided, using -the setup block passed in as a parameter. It then returns an object -that describes the proof it has just performed. This object can be -passed into later instances of `jvm_verify` to indicate that calls to -the analyzed method do not need to be followed, and the previous proof -about that method can be used instead of re-analyzing it. - -Interactive Interpreter -======================= - -The examples so far have used SAWScript in batch mode on complete -script files. It also has an interactive Read-Eval-Print Loop (REPL) -which can be convenient for experimentation. To start the REPL, run -SAWScript with no arguments: - - > saw - -The REPL can evaluate any command that would appear at the top level -of a standalone script, or in the `main` function, as well as a few -special commands that start with a colon: - -\newpage - - :env display the current sawscript environment - :type check the type of an expression - :browse display the current environment - :eval evaluate an expression and print the result - :? display a brief description about a built-in operator - :help display a brief description about a built-in operator - :quit exit the REPL - :load load a module - :add load an additional module - :cd set the current working directory - -As an example of the sort of interactive use that the REPL allows, -consider the file `code/NQueens.cry`, which provides a Cryptol -specification of the problem of placing a specific number of queens on -a chess board in such a way that none of them threaten any of the -others. - -```` -$include 21-56 code/NQueens.cry -```` - -This example gives us the opportunity to use the satisfiability -checking capabilities of SAWScript on a problem other than -equivalence verification. - -First, we can load a model of the `nQueens` term from the Cryptol file. - - sawscript> m <- cryptol_load "NQueens.cry" - sawscript> let nq8 = {{ m::nQueens`{8} }} - -Once we've extracted this model, we can try it on a specific -configuration to see if it satisfies the property that none of the -queens threaten any of the others. - - sawscript> print {{ nq8 [0,1,2,3,4,5,6,7] }} - False - -This particular configuration didn't work, but we can use the -satisfiability checking tools to automatically find one that does. - - sawscript> sat_print abc nq8 - Sat [qs = [3, 1, 6, 2, 5, 7, 4, 0]] - -And, finally, we can double-check that this is indeed a valid solution. - - sawscript> print {{ nq8 [3,1,6,2,5,7,4,0] }} - True - -Other Examples -============== - -The `code` directory [on -GitHub](https://github.com/GaloisInc/saw-script/tree/master/doc/tutorial/code) -contains a few additional examples not mentioned so far. These remaining -examples don't cover significant new material, but help fill in some -extra use cases that are similar, but not identical to those already -covered. - -Java Equivalence Checking -------------------------- - -The previous examples showed comparison between two different LLVM -implementations, and cross-language comparisons between Cryptol, Java, -and LLVM. The script in `ffs_java.saw` compares two different Java -implementations, instead. - -```` -$include all code/ffs_java.saw -```` - -As with previous Java examples, this one needs to be run with the `-b` -flag to tell the interpreter where to find Java: - - > saw -b ffs_java.saw - -AIG Export and Import ---------------------- - -Most of the previous examples have used the `abc` tactic to discharge -theorems. This tactic works by translating the given term to -And-Inverter Graph (AIG) format, transforming the graph in various -ways, and then using a SAT solver to complete the proof. - -Alternatively, the `write_aig` command can be used to write an AIG -directly to a file, in [AIGER format](http://fmv.jku.at/aiger/), for -later processing by external tools, as shown in -`code/ffs_gen_aig.saw`. - -```` -$include all code/ffs_gen_aig.saw -```` - -Conversely, the `read_aig` command can construct an internal term from -an existing AIG file, as shown in `ffs_compare_aig.saw`. - -```` -$include all code/ffs_compare_aig.saw -```` - -We can use external AIGs to verify the equivalence as follows, -generating the AIGs with the first script and comparing them with the -second: - - > saw -b ffs_gen_aig.saw - > saw ffs_compare_aig.saw - -Files in AIGER format can be produced and processed by several -external tools, including ABC, Cryptol version 1, and various hardware -synthesis and verification systems. diff --git a/intTests/test0019_jvm_switch_statement/test.saw b/intTests/test0019_jvm_switch_statement/test.saw index d9276d10f2..8631996098 100644 --- a/intTests/test0019_jvm_switch_statement/test.saw +++ b/intTests/test0019_jvm_switch_statement/test.saw @@ -1,4 +1,4 @@ -import "../../doc/tutorial/code/ffs.cry"; +import "../../doc/llvm-java-verification-with-saw/code/ffs.cry"; FFS <- java_load_class "FFS"; java_ffs_imp <- jvm_extract FFS "ffs_imp"; prove_print abc {{ \x -> java_ffs_imp x == ffs_ref x }}; diff --git a/intTests/test_tutorial1/test.sh b/intTests/test_tutorial1/test.sh index 6b5d8942b8..ddebea3e67 100644 --- a/intTests/test_tutorial1/test.sh +++ b/intTests/test_tutorial1/test.sh @@ -2,7 +2,7 @@ set -e mkdir -p tmp -cp ../../doc/tutorial/code/* tmp +cp ../../doc/llvm-java-verification-with-saw/code/* tmp cd tmp $SAW ffs_java.saw $SAW ffs_java_crucible.saw diff --git a/intTests/test_tutorial2/test.sh b/intTests/test_tutorial2/test.sh index 77bf3d63a7..f7e9a7be7d 100644 --- a/intTests/test_tutorial2/test.sh +++ b/intTests/test_tutorial2/test.sh @@ -2,7 +2,7 @@ set -e mkdir -p tmp -cp ../../doc/tutorial/code/* tmp +cp ../../doc/llvm-java-verification-with-saw/code/* tmp cd tmp $SAW java_add.saw cd .. diff --git a/intTests/test_tutorial3/test.sh b/intTests/test_tutorial3/test.sh index 698fcbb456..d09a56ec2b 100644 --- a/intTests/test_tutorial3/test.sh +++ b/intTests/test_tutorial3/test.sh @@ -2,7 +2,7 @@ set -e mkdir -p tmp -cp ../../doc/tutorial/code/* tmp +cp ../../doc/llvm-java-verification-with-saw/code/* tmp cd tmp $SAW nqueens.saw cd .. diff --git a/intTests/test_tutorial5/test.sh b/intTests/test_tutorial5/test.sh index 9b0f8a1595..98b7edcc14 100644 --- a/intTests/test_tutorial5/test.sh +++ b/intTests/test_tutorial5/test.sh @@ -2,7 +2,7 @@ set -e mkdir -p tmp -cp ../../doc/tutorial/code/* tmp +cp ../../doc/llvm-java-verification-with-saw/code/* tmp cd tmp $SAW des-cryptol2.saw $SAW des3.saw diff --git a/intTests/test_tutorial_w4/test.sh b/intTests/test_tutorial_w4/test.sh index c413bf65b0..0e98fc96b1 100644 --- a/intTests/test_tutorial_w4/test.sh +++ b/intTests/test_tutorial_w4/test.sh @@ -2,7 +2,7 @@ set -e mkdir -p tmp -cp ../../doc/tutorial/code/* tmp +cp ../../doc/llvm-java-verification-with-saw/code/* tmp cd tmp sed -i.bak s/abc/w4/g ffs_java.saw sed -i.bak s/abc/w4/g ffs_llvm.saw