rems-task2.bib

@inproceedings{rems-fmics14,
  title = {Randomised Testing of a Microprocessor Model Using {SMT}-Solver State Generation},
  author = {Brian Campbell and Ian Stark},
  booktitle = {\textbf{FMICS 2014}: Formal Methods for Industrial Critical Systems},
  conf = {FMICS 2014},
  year = 2014,
  month = sep,
  editor = {Fr{\'{e}}d{\'{e}}ric Lang and Francesco Flammini},
  volume = 8718,
  series = {Lecture Notes in Computer Science},
  pages = {185--199},
  publisher = {Springer},
  doi = {10.1007/978-3-319-10702-8_13},
  slides = {rems/fmics-m0-talk.pdf},
  preprint = {rems/fmics2014.pdf},
  url = {http://homepages.inf.ed.ac.uk/bcampbe2/rems/fmics2014.pdf},
  preprintnote = {The final publication is available at link.springer.com},
  abstract = {We validate a HOL4 model of the ARM Cortex-M0 microcontroller core by testing the model's behaviour on randomly chosen
instructions against a real chip.
The model and our intended application involve precise timing information about instruction execution, but the implementations are pipelined,
so checking the behaviour of single instructions would not give us sufficient confidence in the model. Thus we test the model using sequences
of randomly chosen instructions.
The main challenge is to meet the constraints on the initial and intermediate execution states: we must ensure that memory accesses are in
range and that we respect restrictions on the instructions. By careful
transformation of these constraints an off-the-shelf SMT solver can be
used to find suitable states for executing test sequences.
}
}
@article{rems-scico15,
  author = {Brian Campbell and Ian Stark},
  title = {Randomised Testing of a Microprocessor Model Using {SMT}-Solver State Generation},
  journal = {\textbf{Sci. Comput. Program.}},
  volume = 118,
  pages = {60--76},
  year = 2016,
  month = mar,
  issn = {0167-6423},
  doi = {10.1016/j.scico.2015.10.012},
  pdf = {http://homepages.inf.ed.ac.uk/bcampbe2/rems/scico2015.pdf},
  abstract = {We validate a HOL4 model of the ARM Cortex-M0 microcontroller core by testing the model’s behaviour on randomly chosen instructions against real chips from several manufacturers.
The model and our intended application involve precise timing information about instruction execution, but the implementations are pipelined, so checking the behaviour of single instructions would not give us sufficient confidence in the model. Thus we test the model using sequences of randomly chosen instructions.
The main challenge is to meet the constraints on the initial and intermediate execution states: we must ensure that memory accesses are in range and that we respect restrictions on the instructions. By careful transformation of these constraints an off-the-shelf SMT solver can be used to find suitable states for executing test sequences. We also use additional constraints to test our hypotheses about the timing anomalies encountered.}
}
@misc{HCSS2015-Flur,
  optkey = {},
  author = {S. Flur and K. Gray and G. Kerneis and D. Mulligan and C. Pulte and S. Sarkar and P. Sewell},
  title = {Rigorous Architectural Modelling for Production Multiprocessors},
  opthowpublished = {},
  month = may,
  year = 2015,
  note = {Presentation at HCSS 2015: the Fifteenth Annual
High Confidence Software and Systems Conference},
  optannote = {},
  abstract = {
Processor architectures are critical interfaces in computing, but they
are typically defined only by prose and pseudocode documentation.
This is especially problematic for the subtle concurrency behaviour of
weakly consistent multiprocessors such as ARM and IBM POWER: the
traditional documentation does not define precisely what
programmer-observable behaviour is (and is not) allowed for concurrent
code; it is not executable as a test oracle for pre-Silicon or
post-Silicon hardware testing; it is not executable as an emulator for
software testing; and it is not mathematically rigorous enough to
serve as a foundation for software verification.

We present a rigorous architectural envelope model for IBM POWER
multiprocessors, and similar work in progress for ARM, that aims to
support all of these for small-but-intricate test cases, integrating
an operational concurrency model with an ISA model for the sequential
behaviour of a substantial fragment of the user-mode instruction set
(largely automatically derived from the vendor pseudocode, and
expressed in a new ISA description language).  The key question is the
interface between these two: we have to allow all the required
relaxed-memory behaviour, without overcommitting to some particular
microarchitectural model.  Our models can be automatically translated
into executable code, which, combined with front-ends for concurrency
litmus tests and ELF executables, can interactively or exhaustively
explore all the allowed behaviours of small test cases.
}
}
@inproceedings{DBLP:conf/asplos/AlglaveBDGKPSW15,
  author = {Jade Alglave and
               Mark Batty and
               Alastair F. Donaldson and
               Ganesh Gopalakrishnan and
               Jeroen Ketema and
               Daniel Poetzl and
               Tyler Sorensen and
               John Wickerson},
  title = {{GPU} Concurrency: Weak Behaviours and Programming Assumptions},
  booktitle = {\textbf{ASPLOS 2015}: Proceedings of the Twentieth International Conference on Architectural
               Support for Programming Languages and Operating Systems, Istanbul, Turkey},
  conf = {ASPLOS 2015},
  pages = {577--591},
  year = {2015},
  month = mar,
  optcrossref = {DBLP:conf/asplos/2015},
  url = {http://doi.acm.org/10.1145/2694344.2694391},
  doi = {10.1145/2694344.2694391},
  timestamp = {Thu, 26 Feb 2015 17:21:18 +0100},
  biburl = {http://dblp.uni-trier.de/rec/bib/conf/asplos/AlglaveBDGKPSW15},
  bibsource = {dblp computer science bibliography, http://dblp.org},
  abstract = {
Concurrency is pervasive and perplexing, particularly on graphics processing units (GPUs). Current specifications of languages and hardware are inconclusive; thus programmers often rely on folklore assumptions when writing software.

To remedy this state of affairs, we conducted a large empirical study of the concurrent behaviour of deployed GPUs. Armed with litmus tests (i.e. short concurrent programs), we questioned the assumptions in programming guides and vendor documentation about the guarantees provided by hardware. We developed a tool to generate thousands of litmus tests and run them under stressful workloads. We observed a litany of previously elusive weak behaviours, and exposed folklore beliefs about GPU programming---often supported by official tutorials---as false.

As a way forward, we propose a model of Nvidia GPU hardware, which correctly models every behaviour witnessed in our experiments. The model is a variant of SPARC Relaxed Memory Order (RMO), structured following the GPU concurrency hierarchy.
}
}
@inproceedings{CHERI-Oakland,
  author = {Robert N. M. Watson and Jonathan Woodruff and Peter G. Neumann and Simon W. Moore and Jonathan Anderson and David Chisnall and Nirav Dave and Brooks Davis and Khilan Gudka and Ben Laurie and Steven J. Murdoch and Robert Norton and Michael Roe and Stacey Son and Munraj Vadera},
  title = {{CHERI}: A Hybrid Capability-System Architecture for Scalable Software Compartmentalization},
  optcrossref = {},
  optkey = {},
  booktitle = {\textbf{Security and Privacy 2015}: {Proceedings of the 36th IEEE Symposium on Security and Privacy (``Oakland'')}},
  conf = {Security and Privacy 2015},
  optpages = {},
  year = {2015},
  opteditor = {},
  optvolume = {},
  optnumber = {},
  optseries = {},
  optaddress = {},
  month = may,
  optorganization = {},
  optpublisher = {},
  optnote = {},
  optannote = {},
  doi = {http://dx.doi.org/10.1109/SP.2015.9},
  url = {http://www.cl.cam.ac.uk/research/security/ctsrd/pdfs/201505-oakland2015-cheri-compartmentalization.pdf},
  abstract = {CHERI extends a conventional RISC Instruction-Set Architecture, compiler, and operating system to support
fine-grained, capability-based memory protection to mitigate
memory-related vulnerabilities in C-language TCBs. We describe
how CHERI capabilities can also underpin a hardware-software
object-capability model for application compartmentalization
that can mitigate broader classes of attack. Prototyped as an
extension to the open-source 64-bit BERI RISC FPGA softcore processor, FreeBSD operating system, and LLVM compiler,
we demonstrate multiple orders-of-magnitude improvement in
scalability, simplified programmability, and resulting tangible
security benefits as compared to compartmentalization based on
pure Memory-Management Unit (MMU) designs. We evaluate
incrementally deployable CHERI-based compartmentalization
using several real-world UNIX libraries and applications.
}
}
@inproceedings{DBLP:conf/micro/GrayKMPSS15,
  author = {Kathryn E. Gray and
               Gabriel Kerneis and
               Dominic P. Mulligan and
               Christopher Pulte and
               Susmit Sarkar and
               Peter Sewell},
  title = {An integrated concurrency and core-{ISA} architectural envelope definition,
               and test oracle, for {IBM} {POWER} multiprocessors},
  abstract = {Weakly consistent multiprocessors such as ARM and IBM POWER have been with us for decades, but their subtle programmer-visible concurrency behaviour remains challenging, both to implement and to use; the traditional architecture documentation, with its mix of prose and pseudocode, leaves much unclear.

In this paper we show how a precise architectural envelope model for such architectures can be defined, taking IBM POWER as our example.  Our model specifies, for an arbitrary test program, the set of all its allowable executions, not just those of some particular implementation.  The model integrates an operational concurrency model with an ISA model for the fixed-point non-vector user-mode instruction set (largely automatically derived from the vendor pseudocode, and expressed in a new ISA description language).  The key question is the interface between these two: allowing all the required concurrency behaviour, without overcommitting to some particular microarchitectural implementation, requires a novel abstract structure.

Our model is expressed in a mathematically rigorous language that can be automatically translated to an executable test-oracle tool; this lets one either interactively explore or exhaustively compute the set of all allowed behaviours of intricate test cases, to provide a reference for hardware and software development.
},
  booktitle = {\textbf{MICRO 2015}: Proceedings of the 48th International Symposium on Microarchitecture,
            , Waikiki, HI, USA},
  conf = {MICRO 2015},
  pages = {635--646},
  year = {2015},
  month = dec,
  optcrossref = {DBLP:conf/micro/2015},
  url = {http://doi.acm.org/10.1145/2830772.2830775},
  pdf = {http://www.cl.cam.ac.uk/~pes20/micro-48-2015.pdf},
  doi = {10.1145/2830772.2830775},
  timestamp = {Sun, 10 Jan 2016 12:42:56 +0100},
  biburl = {http://dblp.uni-trier.de/rec/bib/conf/micro/GrayKMPSS15},
  bibsource = {dblp computer science bibliography, http://dblp.org}
}
@inproceedings{DBLP:conf/popl/FlurGPSSMDS16,
  author = {Shaked Flur and
               Kathryn E. Gray and
               Christopher Pulte and
               Susmit Sarkar and
               Ali Sezgin and
               Luc Maranget and
               Will Deacon and
               Peter Sewell},
  title = {Modelling the {ARMv8} architecture, operationally: concurrency and {ISA}},
  abstract = {In this paper we develop semantics for key aspects of the ARMv8 multiprocessor architecture: the concurrency model and much of the 64-bit application-level instruction set (ISA).  Our goal is to clarify what the range of architecturally allowable behaviour is, and thereby to support future work on formal verification, analysis, and testing of concurrent ARM software and hardware.

Establishing such models with high confidence is intrinsically difficult: it involves capturing the vendor's architectural intent, aspects of which (especially for concurrency) have not previously been precisely defined.  We therefore first develop a concurrency model with a microarchitectural flavour, abstracting from many hardware implementation concerns but still close to hardware-designer intuition.  This means it can be discussed in detail with ARM architects.  We then develop a more abstract model, better suited for use as an architectural specification, which we prove sound w.r.t.~the first.

The instruction semantics involves further difficulties, handling the mass of detail and the subtle intensional information required to interface to the concurrency model.  We have a novel ISA description language, with a lightweight dependent type system, letting us do both with a rather direct representation of the ARM reference manual instruction descriptions.

We build a tool from the combined semantics that lets one explore, either interactively or exhaustively, the full range of architecturally allowed behaviour, for litmus tests and (small) ELF executables.  We prove correctness of some optimisations needed for tool performance.

We validate the models by discussion with ARM staff, and by comparison against ARM hardware behaviour, for ISA single- instruction tests and concurrent litmus tests.},
  booktitle = {\textbf{POPL 2016}: Proceedings of the 43rd Annual {ACM} {SIGPLAN-SIGACT} Symposium on
               Principles of Programming Languages, St. Petersburg,
               FL, USA },
  conf = {POPL 2016},
  pages = {608--621},
  year = {2016},
  month = jan,
  optcrossref = {DBLP:conf/popl/2016},
  url = {http://doi.acm.org/10.1145/2837614.2837615},
  pdf = {http://www.cl.cam.ac.uk/~pes20/popl16-armv8/top.pdf},
  doi = {10.1145/2837614.2837615},
  timestamp = {Fri, 08 Jan 2016 18:18:25 +0100},
  biburl = {http://dblp.uni-trier.de/rec/bib/conf/popl/FlurGPSSMDS16},
  bibsource = {dblp computer science bibliography, http://dblp.org}
}
@inproceedings{DBLP:conf/memocode/NaylorM15,
  author = {Matthew Naylor and
       Simon W. Moore},
  title = {A generic synthesisable test bench},
  booktitle = {\textbf{MEMOCODE 2015}: {ACM/IEEE} International Conference on Formal
               Methods and Models
       for Codesign,  Austin, TX, USA},
  conf = {MEMOCODE 2015},
  pages = {128--137},
  year = {2015},
  month = sep,
  url = {http://dx.doi.org/10.1109/MEMCOD.2015.7340479},
  doi = {10.1109/MEMCOD.2015.7340479},
  timestamp = {Tue, 08 Dec 2015 17:08:30 +0100},
  biburl = {http://dblp.uni-trier.de/rec/bib/conf/memocode/NaylorM15},
  bibsource = {dblp computer science bibliography, http://dblp.org},
  pdf = {http://www.cl.cam.ac.uk/research/security/ctsrd/pdfs/201509-memocode2015-bluecheck.pdf},
  abstract = {
Writing test benches is one of the most frequently-performed tasks in
the hardware development process.  The ability to reuse common
test bench features is therefore key to productivity.  In this paper,
we present a generic test bench, parameterised by a
specification of correctness, which can be used to test any
design.  Our test bench provides several important features, including
automatic test-sequence generation and shrinking of counter-examples,
and is fully synthesisable, allowing rigorous testing on FPGA as well
as in simulation.  The approach is easy to use, cheap to implement,
and encourages the formal specification of hardware components through
the reward of automatic testing and simple failure cases.
  }
}
@techreport{UCAM-CL-TR-876,
  author = {Watson, Robert N. M. and Neumann, Peter G. and Woodruff,
          	  Jonathan and Roe, Michael and Anderson, Jonathan and
          	  Chisnall, David and Davis, Brooks and Joannou, Alexandre
          	  and Laurie, Ben and Moore, Simon W. and Murdoch, Steven J.
          	  and Norton, Robert and Son, Stacey},
  title = {{Capability Hardware Enhanced RISC Instructions: CHERI
         	   Instruction-Set Architecture}},
  year = 2015,
  month = sep,
  url = {http://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-876.pdf},
  institution = {University of Cambridge, Computer Laboratory},
  number = {UCAM-CL-TR-876},
  abstract = {This technical report describes CHERI ISAv4, the fourth version of the Capability Hardware Enhanced RISC Instructions (CHERI) Instruction-Set Architecture (ISA). CHERI is being developed by SRI International and the University of Cambridge. This design captures four years of research, development, refinement, formal analysis, and testing, and is a substantial enhancement to the ISA version described in UCAM-CL-TR-850. Key improvements lie in tighter C-language integration, and more mature support for software object-capability models; these changes result from experience gained in adapting substantial software stacks to run on prototype hardware.

The CHERI instruction set is based on a hybrid capability-system architecture that adds new capability-system primitives to a commodity 64-bit RISC ISA enabling software to efficiently implement fine-grained memory protection and a hardware-software object-capability security model. These extensions support incrementally adoptable, high-performance, formally based, programmer-friendly underpinnings for fine-grained software decomposition and compartmentalization, motivated by and capable of enforcing the principle of least privilege.

The CHERI system architecture purposefully addresses known performance and robustness gaps in commodity ISAs that hinder the adoption of more secure programming models centered around the principle of least privilege. To this end, CHERI blends traditional paged virtual memory with a per-address-space capability model that includes capability registers, capability instructions, and tagged memory that have been added to the 64-bit MIPS ISA via a new capability coprocessor. CHERI also learns from the C-language fat-pointer literature: CHERI capabilities can describe not only regions of memory, but can also capture C pointer semantics allowing capabilities to be substituted for pointers in generated code.

CHERI’s hybrid system approach, inspired by the Capsicum security model, allows incremental adoption of capability-oriented software design: software implementations that are more robust and resilient can be deployed where they are most needed, while leaving less critical software largely unmodified, but nevertheless suitably constrained to be incapable of having adverse effects. For example, we are focusing conversion efforts on low-level TCB components of the system: separation kernels, hypervisors, operating system kernels, language runtimes, and userspace TCBs such as web browsers. Likewise, we see early-use scenarios (such as data compression, protocol parsing, image processing, and video processing) that relate to particularly high-risk software libraries, which are concentrations of both complex and historically vulnerability-prone code combined with untrustworthy data sources, while leaving containing applications unchanged.

This report describes the CHERI Instruction-Set Architecture (ISA) and design, and provides reference documentation and potential memory models, along with their requirements. It also briefly addresses the CHERI system hardware-software architecture, documenting our current thinking on integrating programming languages and operating systems with the CHERI hardware.}
}
@techreport{UCAM-CL-TR-877,
  author = {Watson, Robert N. M. and Chisnall, David and Davis, Brooks
                  and Koszek, Wojciech and Moore, Simon W. and Murdoch,
                  Steven J. and Neumann, Peter G. and Woodruff, Jonathan},
  title = {{Capability Hardware Enhanced RISC Instructions: CHERI
                   Programmer's Guide}},
  year = 2015,
  month = sep,
  url = {http://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-877.pdf},
  institution = {University of Cambridge, Computer Laboratory},
  number = {UCAM-CL-TR-877},
  abstract = {The CHERI Programmer's Guide documents the software environment for the Capability Hardware Enhanced RISC Instructions (CHERI) prototype developed by SRI International and the
University of Cambridge. The Guide is targeted at hardware and software developers working
with capability-enhanced software. It describes how to use the CHERI Clang/LLVM compiler
suite and CheriBSD operating system -- versions of the off-the-shelf LLVM suite and FreeBSD
operating system adapted to use CHERI’s protection features -- as well as implementation details of both.}
}
@inproceedings{NaylorFMCAD2016,
  author = {M. Naylor and S. W. Moore and A. Mujumdar},
  booktitle = {\textbf{FMCAD 2016}: Formal Methods in Computer-Aided Design},
  conf = {FMCAD 2016},
  title = {A Consistency Checker for Memory Subsystem Traces},
  year = {2016},
  month = sep,
  abstract = {Verifying the memory subsystem in a modern
shared-memory multiprocessor is a big challenge. Optimized
implementations are highly sophisticated, yet must provide subtle
consistency and liveness guarantees for the correct execution of
concurrent programs. We present a tool that supports efficient
specification-based testing of the memory subsystem against a
range of formally specified consistency models. Our tool operates
directly on the memory subsystem interface, promoting a compositional
approach to system-on-chip verification, and can be
used to search for simple failure cases – assisting rapid debug. It
has recently been incorporated into the development flows of two
open-source implementations -- Berkeley’s Rocket Chip (RISCV)
and Cambridge’s BERI (MIPS) -- where it has uncovered a
number of serious bugs.},
  pdf = {http://www.cl.cam.ac.uk/~pes20/rems/papers/fmcad2016-camera-ready.pdf},
  doi = {http://dx.doi.org/10.17863/CAM.4451}
}
@inproceedings{mixed17,
  author = {Shaked Flur and Susmit Sarkar and Christopher Pulte and Kyndylan Nienhuis and Luc Maranget and Kathryn E. Gray and Ali Sezgin and Mark Batty and Peter Sewell},
  title = {Mixed-size Concurrency: {ARM}, {POWER}, {C/C++11}, and {SC}},
  booktitle = {\textbf{POPL 2017}: The 44st Annual {ACM} {SIGPLAN-SIGACT} Symposium on Principles of
               Programming Languages,  Paris, France},
  conf = {POPL 2017},
  year = {2017},
  month = jan,
  pages = {429--442},
  numpages = {14},
  url = {http://doi.acm.org/10.1145/3009837.3009839},
  doi = {10.1145/3009837.3009839},
  abstract = {Previous work on the semantics of relaxed shared-memory concurrency has only considered the case in which each load reads the data of exactly one store.  In practice, however, multiprocessors support mixed-size accesses, and these are used by systems software and (to some degree) exposed at the C/C++ language level.  A semantic foundation for software, therefore, has to address them.


We investigate the mixed-size behaviour of ARMv8 and IBM POWER architectures and implementations: by experiment, by developing semantic models, by testing the correspondence between these, and by discussion with ARM and IBM staff.  This turns out to be surprisingly subtle, and on the way we have to revisit the fundamental concepts of coherence and sequential consistency, which change in this setting.  In particular, we show that adding a memory barrier between each instruction does not restore sequential consistency.  We go on to extend the C/C++11 model to support non-atomic mixed-size memory accesses.


This is a necessary step towards semantics for real-world shared-memory concurrent code, beyond litmus tests.
},
  pdf = {http://www.cl.cam.ac.uk/~pes20/popl17/mixed-size.pdf}
}
@article{Watson2015,
  author = {Robert N.M. Watson and Robert M. Norton and Jonathan Woodruff
  and Simon W. Moore and Peter G. Neumann and Jonathan Anderson and
  David Chisnall and Brooks Davis and Ben Laurie and Michael Roe and
  Nirav H. Dave and Khilan Gudka and Alexandre Joannou and
  A. Theodore Markettos and Ed Maste and Steven J. Murdoch and Colin Rothwell
  and Stacey D. Son and Munraj Vadera},
  title = {Fast Protection-Domain Crossing in the {CHERI} Capability-System
  Architecture},
  journal = {IEEE Micro},
  volume = {36},
  number = {5},
  pages = {38--49},
  month = sep,
  year = 2016,
  doi = {10.1109/MM.2016.84},
  pdf = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=7723791},
  abstract = {Capability Hardware Enhanced RISC Instructions (CHERI) supplement
the conventional memory management unit (MMU) with instruction-set architecture
(ISA) extensions that implement a capability system model in the address space.
CHERI can also underpin a hardware-software object-capability model for
scalable application compartmentalization that can mitigate broader classes of
attack. This article describes ISA additions to CHERI that support fast
protection-domain switching, not only in terms of low cycle count, but also
efficient memory sharing with mutual distrust. The authors propose ISA support
for sealed capabilities, hardware-assisted checking during protection-domain
switching, a lightweight capability flow-control model, and fast register
clearing, while retaining the flexibility of a software-defined
protection-domain transition model. They validate this approach through a
full-system experimental design, including ISA extensions, a field-programmable
gate array prototype (implemented in Bluespec SystemVerilog), and a software
stack including an OS (based on FreeBSD), compiler (based on LLVM), software
compartmentalization model, and open-source applications.}
}
@inproceedings{rems-fmcad16,
  title = {Extracting Behaviour from an Executable Instruction Set Model},
  author = {Brian Campbell and Ian Stark},
  booktitle = {\textbf{FMCAD 2016}: Formal Methods in Computer-Aided Design},
  conf = {FMCAD 2016},
  year = 2016,
  month = oct,
  editor = {Ruzica Piskac and Muralidhar Talupur},
  pages = {33--40},
  isbn = {978-0-9835678-6-8},
  pdf = {http://homepages.inf.ed.ac.uk/bcampbe2/rems/fmcad16.pdf},
  note = {Full proceedings \url{http://www.cs.utexas.edu/users/hunt/FMCAD/FMCAD16/proceedings/fmcad-2016-proceedings.pdf}},
  abstract = {Presenting large formal instruction set models as executable functions makes them accessible to engineers and useful for less formal purposes such as simulation.  However, it is more difficult to extract information about the behaviour of individual instructions for reasoning.  We present a method which combines symbolic evaluation and symbolic execution techniques to provide a rule-based view of instruction behaviour, with particular application to automatic test generation for large MIPS-like models.}
}
@techreport{UCAM-CL-TR-907,
  author = {Watson, Robert N. M. and Neumann, Peter G. and Woodruff,
          	 Jonathan and Roe, Michael and Anderson, Jonathan and
          	 Baldwin, John and Chisnall, David and Davis, Brooks and
          	 Joannou, Alexandre and Laurie, Ben and Moore, Simon W. and
          	 Murdoch, Steven J. and Norton, Robert and Son, Stacey and
          	 Xia, Hongyan},
  title = {{Capability Hardware Enhanced RISC Instructions: CHERI
         	  Instruction-Set Architecture (Version 6)}},
  year = 2017,
  month = apr,
  url = {http://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-907.pdf},
  institution = {University of Cambridge, Computer Laboratory},
  number = {UCAM-CL-TR-907}
}
@inproceedings{Joannou2017Tags,
  author = {Alexandre Joannou and Jonathan Woodruff and Robert Kovacsics and Simon W. Moore and Alex Bradbury and Hongyan Xia and Robert N. M. Watson and David Chisnall and Michael Roe and Brooks Davis and Edward Napierala and John Baldwin and Khilan Gudka and Peter G. Neumann and Alfredo Mazzinghi and Alex Richardson and Stacey Son and A. Theodore Markettos},
  title = {Efficient Tagged Memory},
  booktitle = {\textbf{ICCD 2017}: IEEE 35th International Conference on Computer Design},
  conf = {ICCD 2017},
  year = {2017},
  month = nov,
  url = {},
  doi = {},
  pdf = {http://www.cl.cam.ac.uk/research/security/ctsrd/pdfs/201711-iccd2017-efficient-tags.pdf},
  abstract = {We characterize the cache behavior of an in-memory tag table and demonstrate that an optimized implementation can typically achieve a near-zero memory traffic overhead. Both industry and academia have repeatedly demonstrated tagged memory as a key mechanism to enable enforcement of power- ful security invariants, including capabilities, pointer integrity, watchpoints, and information-flow tracking. A single-bit tag shadowspace is the most commonly proposed requirement, as one bit is the minimum metadata needed to distinguish between an untyped data word and any number of new hardware- enforced types. We survey various tag shadowspace approaches and identify their common requirements and positive features of their implementations. To avoid non-standard memory widths, we identify the most practical implementation for tag storage to be an in-memory table managed next to the DRAM controller. We characterize the caching performance of such a tag table and demonstrate a DRAM traffic overhead below 5\% for the vast majority of applications. We identify spatial locality on a page scale as the primary factor that enables surprisingly high table cache-ability. We then demonstrate tag-table compression for a set of common applications. A hierarchical structure with elegantly simple optimizations reduces DRAM traffic overhead to below 1% for most applications. These insights and optimizations pave the way for commercial applications making use of single-bit tags stored in commodity memory.}
}
@inproceedings{armv8-mca,
  author = {Christopher Pulte and Shaked Flur and Will Deacon and Jon French and Susmit Sarkar and Peter Sewell},
  title = {{Simplifying ARM Concurrency: Multicopy-atomic Axiomatic and Operational Models for ARMv8}},
  booktitle = {\textbf{POPL 2018}},
  conf = {POPL 2018},
  optcrossref = {},
  optkey = {},
  optbooktitle = {},
  optpages = {},
  year = {2018},
  opteditor = {},
  optvolume = {},
  optnumber = {},
  optseries = {},
  optaddress = {},
  month = jan,
  optorganization = {},
  optpublisher = {},
  optnote = {Draft},
  optannote = {},
  pdf = {http://www.cl.cam.ac.uk/~pes20/armv8-mca/armv8-mca-draft.pdf},
  project = {http://www.cl.cam.ac.uk/~pes20/armv8-mca/},
  abstract = {ARM has a relaxed memory model, previously specified in informal prose for ARMv7 and ARMv8.  Over time, and partly due to work building formal semantics for ARM concurrency, it has become clear that some of the complexity of the model is not justified by the potential benefits.  In particular, the model was originally \emph{non-multicopy-atomic}: writes could become visible to some other threads before becoming visible to all --- but this has not been exploited in production implementations, the corresponding potential hardware optimisations are thought to have insufficient benefits in the ARM context, and it gives rise to subtle complications when combined with other ARMv8 features.  The ARMv8 architecture has therefore been revised: it now has a multicopy-atomic model.  It has also been simplified in other respects, including more straightforward notions of dependency, and the architecture now includes a formal concurrency model.

In this paper we detail these changes and discuss their motivation. We define two formal concurrency models: an operational one, simplifying the Flowing model of Flur et al., and the axiomatic model of the revised ARMv8 specification. The models were developed by an academic group and by ARM staff, respectively, and this extended collaboration partly motivated the above changes. We prove the equivalence of the two models. The operational model is integrated into an executable exploration tool with new web interface, demonstrated by exhaustively checking the possible behaviours of a loop-unrolled version of a Linux kernel lock implementation, a previously known bug due to unprevented speculation, and a fixed version.
},
  topic = {Power_and_ARM},
  recent = {true}
}

This file was generated by bibtex2html 1.96.