1. CPU, GPU, GPGPU Architecture
CPU, GPU, and GPGPU architectures are all types of computer processing architectures, but they differ in their design and operation.

CPU: A central processing unit (CPU) is a processing unit designed to perform a wide range of computing tasks, including data processing, mathematical and logical calculations, and communication between the different components of a computer system. Modern CPUs usually have multiple cores in order to process several tasks simultaneously.

GPU: A graphics processing unit (GPU) is an architecture designed to accelerate the processing of images and graphics. GPUs have thousands of cores that allow them to process millions of pixels simultaneously, making them an ideal choice for video games, 3D modeling, and other graphics-intensive applications.

GPGPU: A general-purpose GPU (GPGPU) is a GPU that is used for purposes other than graphics processing. GPGPUs perform compute-intensive work using the hundreds or thousands of cores available on the graphics card. They are particularly effective for parallel computing, machine learning, and other computationally intensive areas.

In conclusion, the main difference between the three architectures lies in their design and operation: CPUs are designed for general-purpose processing, GPUs are designed for specialized graphics processing, and GPGPUs are GPUs used for specialized computation other than graphics processing.
1.1 CPU
The CPU basically consists of three parts:

- The control unit, which fetches instructions from memory, decodes them, and coordinates the rest of the processor to execute them. A basic control unit essentially consists of an instruction register and a "decoder/sequencer" unit.
- The arithmetic and logic unit (ALU), which executes the arithmetic and logic instructions requested by the control unit. Instructions can involve one or more operands. Execution speed is optimal when the operands are located in registers rather than in memory external to the processor.
- Registers, which are memory cells internal to the CPU. They are few in number but very quick to access. They are used to store variables, the intermediate results of (arithmetic or logical) operations, or processor control information.

The register structure varies from processor to processor, which is why each type of CPU has its own instruction set. Their basic functions are nevertheless similar, and all processors have roughly the same categories of registers:
- The accumulator is primarily intended to hold the data that needs to be processed by the ALU.
- General registers are used to store temporary data and intermediate results.
- Address registers are used to construct particular data addresses. These include, for example, the base and index registers, which make it possible, among other things, to organize data in memory as indexed tables.
- The instruction register contains the code of the instruction being processed by the decoder/sequencer.
- The program counter (ordinal counter) contains the address of the next instruction to be executed. In principle, this register never stops counting: it generates the addresses of the instructions to be executed one after the other. Some instructions require changing the contents of the program counter to perform a sequence break, i.e. a jump elsewhere in the program.
- The status register, sometimes called the condition register, contains indicators called flags whose values (0 or 1) vary according to the results of arithmetic and logical operations. These states are used by conditional jump instructions.
- The stack pointer manages certain data in memory by organizing it in the form of stacks.
CPU working principle
The content of the program counter is placed on the address bus in order to fetch a machine-code instruction. The control bus produces a read signal, and the memory location selected by that address sends the instruction code back to the processor via the data bus. Once the instruction lands in the instruction register, the processor's control unit decodes it and produces the appropriate sequence of internal and external signals that coordinate its execution. An instruction comprises a series of elementary tasks, clocked by clock cycles.

All the tasks that constitute an instruction are executed one after the other, so the execution of an instruction lasts several cycles. Since it is not always possible to increase the clock frequency, the only way to increase the number of instructions processed in a given time is to execute several of them simultaneously. This is achieved by splitting processor resources, data and/or processes; this is called parallelization.
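To make this fetch-decode-execute cycle concrete, here is a minimal sketch of a toy interpreter in C++; the opcodes, register file and program below are invented purely for illustration and do not correspond to any real instruction set:

```cpp
#include <cstdint>
#include <cstdio>

// Invented 3-instruction machine: two registers and a tiny program in memory.
enum Opcode : uint8_t { LOAD_IMM = 0, ADD = 1, HALT = 2 };

int main() {
    uint8_t memory[] = {          // the program, stored in memory
        LOAD_IMM, 0, 5,           // R0 <- 5
        LOAD_IMM, 1, 7,           // R1 <- 7
        ADD,      0, 1,           // R0 <- R0 + R1
        HALT
    };
    int32_t reg[2] = {0, 0};      // register file
    unsigned pc = 0;              // program counter

    for (;;) {
        uint8_t instr = memory[pc];   // fetch into the "instruction register"
        switch (instr) {              // decode, then execute
            case LOAD_IMM: reg[memory[pc + 1]] = memory[pc + 2]; pc += 3; break;
            case ADD:      reg[memory[pc + 1]] += reg[memory[pc + 2]]; pc += 3; break;
            case HALT:     printf("R0 = %d\n", reg[0]); return 0;
        }
    }
}
```

Each loop iteration mirrors the cycle described above: the program counter selects the instruction, the switch plays the role of the decoder/sequencer, and the case bodies stand in for the ALU and registers.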
The different processor architectures
There is a classification of the different CPU architectures. Five in number, they are used by programmers depending on the desired results:

- CISC: very complex addressing;
- RISC: simpler addressing, with instructions executed in a single cycle;
- VLIW: long, but simpler, instructions;
- vector: instead of operating on single numbers, the instructions operate on vectors;
- dataflow: the data is active, unlike in the other architectures.

To further improve processor performance, developers can add supplemental SIMD instruction sets.
1.2 GPU (Graphics Processing Unit)

A Graphics Processing Unit is a graphics (co-)processor capable of performing calculations on images (2D, 3D, video, etc.) very efficiently. The raw computing power offered is higher thanks to the large number of processing units present on these cards. This is why it is not uncommon to obtain large acceleration factors between CPU and GPU for the same application.

Explicit code targeting GPUs: CUDA, HIP, SYCL, Kokkos, RAJA, …
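As a minimal illustration of what such explicit GPU code looks like, here is a CUDA sketch of a SAXPY-style kernel; the kernel name, problem size and launch configuration are arbitrary choices for this example:

```cpp
#include <cstdio>

// Each GPU thread handles one element: y[i] = a*x[i] + y[i].
__global__ void saxpy(int n, float a, const float *x, float *y) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // global thread index
    if (i < n) y[i] = a * x[i] + y[i];
}

int main() {
    const int n = 1 << 20;
    float *x, *y;
    cudaMallocManaged(&x, n * sizeof(float));        // unified memory, visible
    cudaMallocManaged(&y, n * sizeof(float));        // to both CPU and GPU
    for (int i = 0; i < n; ++i) { x[i] = 1.0f; y[i] = 2.0f; }

    int threads = 256;
    int blocks  = (n + threads - 1) / threads;       // enough blocks to cover n
    saxpy<<<blocks, threads>>>(n, 2.0f, x, y);       // run on the GPU
    cudaDeviceSynchronize();

    printf("y[0] = %f\n", y[0]);                     // expect 4.0
    cudaFree(x);
    cudaFree(y);
    return 0;
}
```

The triple-chevron syntax `<<<blocks, threads>>>` is what distinguishes this from ordinary C++: it asks the GPU to run the kernel body once per thread, in parallel.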
Fig.: Illustration of the main hardware architecture differences between CPUs and GPUs. The transistor counts associated with the various functions are represented abstractly by the relative sizes of the shaded areas: green corresponds to computation; gold to instruction processing; purple to the L1 cache; blue to higher-level cache; and orange to memory (DRAM, which in reality should be thousands of times larger than the caches).
GPUs were originally designed to render graphics. They work great for shading, texturing, and rendering the thousands of independent polygons that make up a 3D object. CPUs, on the other hand, are meant to control the logical flow of any general-purpose program, where lots of number crunching may (or may not) be involved. Because of these very different roles, GPUs are characterized by many more processing units and higher aggregate memory bandwidth, while CPUs offer more sophisticated instruction processing and faster clock speeds.
| | CPU: latency-oriented design | GPU: throughput-oriented design |
|---|---|---|
| Clock | High clock frequency | Moderate clock frequency |
| Caches | Large caches; convert high-latency memory accesses into low-latency cache accesses | Small caches, designed to maximize memory throughput |
| Control | Sophisticated control logic; branch prediction to reduce latency due to branching | Simple control; no branch prediction, no data forwarding |
| ALUs | Powerful ALUs with reduced operation latency | Numerous ALUs; long latency, but heavily pipelined for high throughput |
| Other aspects | Lots of chip area devoted to caching and control logic; multi-level caches used to avoid latency; limited number of registers because there are fewer active threads; control logic to reorder execution, extract ILP and minimize pipeline stalls | Requires a very large number of threads for latency to be tolerable |
| Beneficial aspects for applications | CPUs for sequential parts where latency is critical; CPUs can be 10x or more faster than GPUs for sequential code | GPUs for parallel parts where throughput is critical; GPUs can be 10x or more faster than CPUs for parallel code |
1.3 GPGPU (General-Purpose Graphics Processing Unit)

A General-Purpose Graphics Processing Unit (GPGPU) is a graphics processing unit (GPU) that is programmed for purposes beyond graphics processing, such as performing computations typically conducted by a Central Processing Unit (CPU).
GPGPU is short for general-purpose computing on graphics processing units. Today's graphics processors (GPUs) are capable of much more than computing pixels in video games. For this, NVIDIA has for several years been developing a hardware interface and a programming language derived from C: CUDA (Compute Unified Device Architecture). This technology, known as GPGPU (General-Purpose computation on Graphics Processing Units), exploits the computing power of GPUs for the processing of massively parallel tasks. Unlike a CPU, a GPU is not suited to the fast processing of tasks that run sequentially; on the other hand, it is very well suited to processing parallelizable algorithms.
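A minimal CUDA sketch of the typical GPGPU workflow (the array size, kernel and names are arbitrary): data is copied from CPU memory to GPU memory, a massively parallel kernel is launched, and the result is copied back:

```cpp
#include <cstdio>

// Doubles every element of the array; one GPU thread per element.
__global__ void scale(int n, float *data) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;
}

int main() {
    const int n = 1024;
    float host[n];
    for (int i = 0; i < n; ++i) host[i] = float(i);

    float *dev = nullptr;
    cudaMalloc(&dev, n * sizeof(float));                              // allocate GPU memory
    cudaMemcpy(dev, host, n * sizeof(float), cudaMemcpyHostToDevice); // CPU -> GPU

    scale<<<(n + 255) / 256, 256>>>(n, dev);                          // massively parallel step

    cudaMemcpy(host, dev, n * sizeof(float), cudaMemcpyDeviceToHost); // GPU -> CPU
    cudaFree(dev);

    printf("host[3] = %f\n", host[3]);                                // expect 6.0
    return 0;
}
```

The two `cudaMemcpy` calls are the transfers between CPU memory and GPU memory discussed later in this section; for small amounts of work their cost can dominate the computation itself.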
- Array of independent "cores" called compute units
- High-bandwidth, banked L2 caches and main memory
  - Banks allow several parallel accesses
  - Hundreds of GB/s
- Memory and caches are generally not coherent
- Compute units are based on SIMD hardware
  - Both AMD and NVIDIA have 16-element-wide SIMD units
- Large register files are used for fast context switching
  - No state save/restore
  - Data is persistent throughout the execution of the thread
- Both vendors combine an automatic L1 cache with a user-managed scratchpad
  - The scratchpad is heavily banked and has very high bandwidth (~terabytes/second)
- Work items are automatically grouped into hardware threads called "wavefronts" (AMD) or "warps" (NVIDIA)
  - A single instruction stream is executed on the SIMD hardware
  - 64 work items in a wavefront, 32 in a warp
  - The instruction is issued multiple times on the 16-lane SIMD unit
  - Control flow is managed by masking SIMD lanes
- NVIDIA coined the term "Single Instruction, Multiple Threads" (SIMT) to refer to multiple (software) threads sharing one instruction stream
  - Work items run in lockstep on the SIMD hardware
  - Multiple software threads are executed on a single hardware thread
  - Divergence between threads is handled using predication
  - This is transparent in the OpenCL programming model
  - Performance is highly dependent on understanding the work-item-to-SIMD mapping, as the sketch below illustrates
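A small CUDA sketch of that mapping, assuming NVIDIA hardware where a warp is 32 threads (`warpSize` is provided by the CUDA runtime); the launch configuration is arbitrary:

```cpp
#include <cstdio>

// Shows how flat thread indices map onto warps: groups of 32 threads
// (on NVIDIA hardware) that share a single instruction stream.
__global__ void show_mapping() {
    int global_id = blockIdx.x * blockDim.x + threadIdx.x;
    int warp_id   = threadIdx.x / warpSize;   // which warp within the block
    int lane_id   = threadIdx.x % warpSize;   // position within the warp
    if (lane_id == 0)                         // one line per warp keeps output short
        printf("block %d, thread %3d -> warp %d, lane %d, global id %d\n",
               blockIdx.x, threadIdx.x, warp_id, lane_id, global_id);
}

int main() {
    show_mapping<<<2, 64>>>();   // 2 blocks of 64 threads = 4 warps of 32
    cudaDeviceSynchronize();
    return 0;
}
```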
1.4 Architecture of a GPU versus CPU

Such an architecture is said to be "throughput-oriented". NVIDIA's architecture codenamed "Fermi", for example, has 512 cores.
CPU architecture vs. GPU architecture
Traditional microprocessors (CPUs) are essentially "latency-oriented": the goal is to minimize the execution time of a single sequence of a program by reducing latency as much as possible. This design rests on the traditional assumption that parallelism among the operations the processor must perform is very rare.

Throughput-oriented processors assume that their workload contains significant parallelism. The idea is not to execute individual operations as quickly as possible one after the other, but to execute billions of operations simultaneously in a given time; the execution time of any single one of these operations is ultimately almost irrelevant. In a video game, for example, performance is measured in FPS (frames per second): an image, with all its pixels, must be displayed roughly every 30 milliseconds, and it does not matter how long a single pixel takes.

This type of processor has small, independent computation units which execute instructions in the order in which they appear in the program; there is ultimately little dynamic control over execution. The term SIMD (Single Instruction, Multiple Data) is used for these processors.

Each PU (Processing Unit) does not necessarily correspond to a processor; they are computation units. In this mode, the same instruction is applied simultaneously to several pieces of data.

Less control logic means more space on the chip dedicated to computation. However, this also comes at a cost. SIMD execution reaches peak performance when the parallel tasks all follow the same execution branch, and degrades when the tasks diverge: the computation units assigned to one branch have to wait for the computation units executing the previous branch. This results in hardware under-utilization and increased execution time. The efficiency of the SIMD architecture therefore depends on the uniformity of the workload, as illustrated below.
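A minimal CUDA sketch of this divergence effect (the condition and the arithmetic are arbitrary): when even- and odd-numbered threads of the same warp take different paths, the SIMD hardware executes the two paths one after the other with part of its lanes masked off:

```cpp
#include <cstdio>

// Even and odd threads of the same warp take different branches: the SIMD
// hardware executes the two paths one after the other, masking the inactive
// lanes, so roughly half of the lanes sit idle during each branch.
__global__ void divergent(float *out) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i % 2 == 0)
        out[i] = i * 2.0f;   // only even lanes are active here
    else
        out[i] = i * 0.5f;   // only odd lanes are active here
}

int main() {
    const int n = 1024;
    float *out;
    cudaMallocManaged(&out, n * sizeof(float));
    divergent<<<n / 256, 256>>>(out);
    cudaDeviceSynchronize();
    printf("out[2] = %f, out[3] = %f\n", out[2], out[3]);   // expect 4.0 and 1.5
    cudaFree(out);
    return 0;
}
```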
However, given the large number of computation units, it may not matter much that some threads are blocked as long as others can continue executing. Long-latency operations on one thread are "hidden" by other threads that are ready to execute another set of instructions.

For a quad- or octo-core CPU, the creation of threads and their scheduling has a cost. For a GPU, the relative latency "covers" these two steps, making them negligible. However, memory transfers have greater implications for a GPU than for a CPU because of the need to move data between CPU memory and GPU memory.
SIMD (Single Instruction, Multiple Data)

SIMD is a computing technique that allows several data elements to be processed at the same time.

What is SIMD used for?

SIMD can be used in a wide range of applications, such as 3D graphics, signal processing, data mining, and many other processing-intensive tasks. In the realm of 3D graphics, SIMD can be used to process large amounts of data in parallel, making graphics rendering faster and smoother. In signal processing, SIMD can be used to process multiple signals at the same time, thereby increasing the efficiency of signal processing. In data mining, SIMD can be used to process large volumes of data in parallel, which makes data mining faster and more efficient.

SIMD is also commonly used in encryption and data compression algorithms. These algorithms often require the processing of large amounts of data, and SIMD can be used to speed up the process. SIMD can also be used to process large amounts of data in parallel in machine learning algorithms such as artificial neural networks.

Benefits of using SIMD

SIMD has several advantages over other forms of parallelization. First, SIMD is more efficient than traditional software parallelization techniques, such as threading, because it takes advantage of the capabilities of modern processors and is optimized for parallelism. This means that SIMD can process multiple pieces of data in parallel at the same time, which greatly improves program performance.

In addition, SIMD allows more efficient use of memory. Since the same instruction is applied to multiple pieces of data in parallel, the amount of memory required to store the data is reduced. This can help improve performance by reducing the amount of memory needed to store data items.

Finally, SIMD is more flexible than other forms of parallelization, because applying the same instruction to multiple data items in parallel allows the programmer to tailor the code to the application's requirements.
1.5 AMD ROCm Platform, CUDA

1.5.1 AMD ROCm platform

ROCm™ is a collection of drivers, development tools, and APIs that enable GPU programming from the low-level kernel up to end-user applications. ROCm is powered by AMD's Heterogeneous-Compute Interface for Portability (HIP), an open-source C++ GPU programming environment and its corresponding runtime. HIP enables ROCm developers to build portable applications across different platforms by deploying code on a range of devices, from dedicated gaming GPUs to exascale HPC clusters.

ROCm supports programming models such as OpenMP and OpenCL, and includes all the necessary compilers, debuggers, and open-source libraries. ROCm is fully integrated with ML frameworks such as PyTorch and TensorFlow. ROCm can be deployed in several ways, including through containers such as Docker, through Spack, or via your own build from source.

ROCm is designed to help develop, test, and deploy GPU-accelerated HPC, AI, scientific computing, CAD, and other applications in a free, open-source, integrated, and secure software ecosystem.
1.5.2 CUDA Platform

CUDA® is a parallel computing platform and programming model developed by NVIDIA for general computing on graphics processing units (GPUs). With CUDA, developers can dramatically speed up computing applications by harnessing the power of GPUs.

The CUDA architecture is based on a three-level hierarchy of cores, threads, and blocks. Cores are the basic units of computation, while threads are the individual pieces of work that the cores execute. Blocks are collections of threads that are grouped together and can run together. This architecture enables efficient use of GPU resources and makes it possible to run multiple applications at once.
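A short CUDA sketch of this hierarchy with a two-dimensional launch (the matrix size and block shape are arbitrary): each thread derives the matrix element it owns from its block and thread indices:

```cpp
#include <cstdio>

// Each thread works out which matrix element (row, col) it owns from its
// position in the block/grid hierarchy, then fills it in.
__global__ void fill(float *m, int rows, int cols) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < rows && col < cols)
        m[row * cols + col] = row + 0.01f * col;
}

int main() {
    const int rows = 64, cols = 48;
    float *m;
    cudaMallocManaged(&m, rows * cols * sizeof(float));

    dim3 block(16, 16);                          // 256 threads per block
    dim3 grid((cols + block.x - 1) / block.x,    // enough blocks to cover
              (rows + block.y - 1) / block.y);   // the whole matrix
    fill<<<grid, block>>>(m, rows, cols);
    cudaDeviceSynchronize();

    printf("m[5][7] = %f\n", m[5 * cols + 7]);   // expect 5.07
    cudaFree(m);
    return 0;
}
```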
The NVIDIA CUDA-X platform, which is built on CUDA®, brings together a collection of libraries, tools, and technologies that deliver significantly higher performance than competing solutions in multiple application areas, ranging from artificial intelligence to high-performance computing.
| CUDA (Compute Unified Device Architecture) | HIP ("Heterogeneous-Compute Interface for Portability") |
|---|---|
| Has been the de facto standard for native GPU code for years | AMD's effort to offer a common programming interface that works on both CUDA and ROCm devices |
| Huge set of optimized libraries available | Standard C++ syntax; uses the nvcc/hcc compiler in the background |
| Custom syntax (extension of C++) supported only by CUDA compilers | Almost a one-to-one CUDA clone from the user's perspective |
| Support for NVIDIA devices only | The ecosystem is new and growing rapidly |
1.5.3 What is the difference between CUDA and ROCm for GPGPU applications?

NVIDIA's CUDA and AMD's ROCm provide frameworks for taking advantage of the respective GPU platforms.

Graphics processing units (GPUs) were traditionally designed to handle graphics computing tasks, such as image and video processing and rendering, 2D and 3D graphics, vectorization, etc. General-purpose computing on GPUs became more practical and popular after 2001, with the advent of programmable shaders and floating-point support on graphics processors.

Notably, it involved problems with matrices and vectors, including two-, three-, or four-dimensional vectors, which translated easily to the GPU, which handles these types with native speed and support. A milestone for general-purpose GPUs (GPGPUs) came in 2003, when two research groups independently discovered GPU-based approaches for solving general linear algebra problems faster on GPUs than on CPUs.
1.6 GPGPU Evolution

Early efforts to use GPUs as general-purpose processors required reframing computational problems in terms of graphics primitives, which were supported by the two major APIs for graphics processors: OpenGL and DirectX.

These were soon followed by NVIDIA's CUDA, which allowed programmers to set aside the underlying graphics concepts in favor of more common high-performance computing concepts; vendor-neutral frameworks such as OpenCL followed. This meant that modern GPGPU pipelines could take advantage of the speed of a GPU without requiring a complete and explicit conversion of the data to a graphical form.

NVIDIA describes CUDA as a parallel computing platform and application programming interface (API) that allows software to use specific GPUs for general-purpose processing. CUDA is a software layer that provides direct access to the GPU's virtual instruction set and parallel computing elements for running compute kernels.

Not to be outdone, AMD launched its own general-purpose computing platform in 2016, dubbed the Radeon Open Compute Ecosystem (ROCm). ROCm is primarily intended for discrete professional GPUs, such as AMD's Radeon Pro line, but support also extends to consumer products, including gaming GPUs.

Unlike CUDA, the ROCm software stack spans several areas: general-purpose computing on GPUs (GPGPU), high-performance computing (HPC), and heterogeneous computing. It also offers several programming models, such as HIP (GPU kernel-based programming), OpenMP/Message Passing Interface (MPI), and OpenCL, and supports microarchitectures including RDNA and CDNA, for a myriad of applications ranging from AI and edge computing to IoT/IIoT.
NVIDIA's CUDA

Most of NVIDIA's Tesla and RTX series cards come with a set of CUDA cores designed to perform multiple calculations at the same time. These cores are similar to CPU cores, but they are integrated into the GPU and can process data in parallel. There can be thousands of these cores embedded in the GPU, making for incredibly efficient parallel systems capable of offloading CPU-centric tasks directly to the GPU.

Parallel computing is the process of breaking larger problems down into smaller, independent parts that can be executed simultaneously by multiple processors communicating through shared memory, and then combined at the end as part of an overall algorithm. The primary purpose of parallel computing is to increase the available computing power in order to speed up application processing and problem solving.

To this end, the CUDA architecture is designed to work with programming languages such as C, C++, and Fortran, allowing parallel programmers to use GPU resources more easily. This contrasts with earlier APIs such as Direct3D and OpenGL, which required advanced graphics programming skills. CUDA-capable GPUs also support programming frameworks such as OpenMP, OpenACC, OpenCL, and HIP by compiling such code to CUDA.

As with most APIs, software development kits (SDKs), and software stacks, NVIDIA provides libraries, compiler directives, and extensions for the popular programming languages mentioned above, making programming easier and more effective. These include cuSPARSE, NVRTC runtime compilation, GameWorks PhysX, MIG multi-instance GPU support, cuBLAS, and many more.

A good portion of these software stacks is designed to handle AI-based applications, including machine learning and deep learning, computer vision, conversational AI, and recommender systems.

Computer vision applications use deep learning to acquire knowledge from digital images and videos. Conversational AI applications help computers understand and communicate through natural language. Recommender systems use a user's images, language, and interests to deliver meaningful and relevant search results and services.

GPU-accelerated deep learning frameworks provide a level of flexibility to design and train custom neural networks and provide interfaces for commonly used programming languages. All major deep learning frameworks, such as TensorFlow and PyTorch, are already GPU-accelerated, so data scientists and researchers can benefit from them without doing any GPU programming themselves.

Current uses of the CUDA architecture beyond AI include bioinformatics, distributed computing, simulations, molecular dynamics, medical analytics (CT, MRI, and other scan-based imaging applications), encryption, and more.
AMD's ROCm Software Stack

AMD's ROCm software stack is similar to the CUDA platform, except that it is open source and uses the company's GPUs to accelerate computational tasks. The latest Radeon Pro W6000 and RX6000 series cards are equipped with compute cores, ray accelerators (ray tracing), and stream processors that take advantage of the RDNA architecture for parallel processing, including GPGPU, HPC, HIP (a CUDA-like programming model), MPI, and OpenCL.

Since the ROCm ecosystem is composed of open technologies, including frameworks (TensorFlow/PyTorch), libraries (MIOpen/BLAS/RCCL), programming models (HIP), interconnects (OCD), and upstream Linux kernel support, the platform is regularly optimized for performance and efficiency across a wide range of programming languages.

AMD's ROCm is designed to scale, meaning it supports multi-GPU computing within and across server nodes via Remote Direct Memory Access (RDMA), which offers the ability to access host memory directly without CPU intervention. Thus, the more RAM the system has, the larger the processing loads that ROCm can handle.

ROCm also simplifies the stack when the driver directly integrates support for RDMA peer synchronization, making application development easier. Additionally, it includes the ROCr System Runtime, which is language independent and leverages the HSA (Heterogeneous System Architecture) Runtime API, providing a foundation for running programming languages such as HIP and OpenMP.

As with CUDA, ROCm is an ideal solution for AI applications, as several deep learning frameworks already support a ROCm backend (e.g. TensorFlow, PyTorch, MXNet, ONNX, CuPy, etc.). According to AMD, any CPU/GPU vendor can take advantage of ROCm, as it is not a proprietary technology. This means that code written in CUDA or another platform can be ported to the vendor-neutral HIP format, and from there users can compile the code for the ROCm platform.
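As an illustration of how mechanical such a port is, here is a small CUDA sketch with comments indicating the corresponding HIP calls that a porting tool would substitute; the kernel body itself needs no change, since HIP kernels use the same `__global__` and thread-index syntax:

```cpp
#include <cstdio>

// The kernel itself is identical in CUDA and HIP.
__global__ void add_one(int n, float *data) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] += 1.0f;
}

int main() {
    const int n = 256;
    float host[n];
    float *dev = nullptr;

    cudaMalloc(&dev, n * sizeof(float));           // HIP: hipMalloc(&dev, ...)
    cudaMemset(dev, 0, n * sizeof(float));         // HIP: hipMemset(dev, 0, ...)

    add_one<<<1, n>>>(n, dev);                     // HIP: same <<<...>>> launch
                                                   // (or hipLaunchKernelGGL)
    cudaMemcpy(host, dev, n * sizeof(float),       // HIP: hipMemcpy(...,
               cudaMemcpyDeviceToHost);            //      hipMemcpyDeviceToHost)
    cudaFree(dev);                                 // HIP: hipFree(dev)

    printf("host[0] = %f\n", host[0]);             // expect 1.0
    return 0;
}
```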
The company offers a series of libraries, add-ons, and extensions to deepen the functionality of ROCm, including a solution (HCC) for the C++ programming language that allows users to integrate CPU and GPU code in a single file.

The feature set of ROCm is extensive and incorporates multi-GPU support for coarse-grained virtual memory, the ability to handle concurrency and preemption, HSA signals and atomics, and user-mode DMA and queues. It also offers standardized loader and code-object formats, dynamic and offline compilation support, peer-to-peer multi-GPU operation with RDMA support, an event tracking and collection API, as well as APIs and system management tools. On top of that, there is a growing third-party ecosystem that bundles custom ROCm distributions for a given application across a host of Linux flavors.

To further enhance the capability of exascale systems, AMD also announced the availability of its open-source platform, AMD ROCm, which enables researchers to harness the power of AMD Instinct accelerators and drive scientific discovery. Built on a foundation of portability, the ROCm platform is capable of supporting environments from multiple vendors and accelerator architectures.

With ROCm 5.0, AMD extends its open platform powering top HPC and AI applications with AMD Instinct MI200 series accelerators, increasing ROCm accessibility for developers and delivering leading performance on key workloads. And with AMD Infinity Hub, researchers, data scientists, and end users can easily find, download, and install containerized HPC applications and ML frameworks optimized for and supported on AMD Instinct and ROCm.

The hub currently offers a range of containers supporting Radeon Instinct™ MI50, AMD Instinct™ MI100, or AMD Instinct MI200 accelerators, including applications such as Chroma, CP2K, LAMMPS, NAMD, OpenMM, etc., as well as the popular TensorFlow and PyTorch ML frameworks. New containers are continually being added to the hub.
Moves to Unify CPUs and GPUs

1.7 TPU (Tensor Processing Unit) from Google
A Tensor Processing Unit (TPU) is a specialized hardware processor developed by Google to accelerate machine learning. Unlike traditional CPUs or GPUs, TPUs are specifically designed to handle tensor operations, which account for most of the computation in deep learning models. This makes them incredibly efficient at those tasks and provides an enormous speedup compared to CPUs and GPUs. In what follows, we explore what a TPU is, how it works, and why TPUs are so beneficial for machine learning applications.

What Are Tensor Processing Units (TPU)?

A Tensor Processing Unit (TPU) is an application-specific integrated circuit (ASIC) designed specifically for machine learning. TPUs offer improved energy efficiency, allowing businesses to reduce their electricity bills while still achieving the same results as processors with greater energy consumption. This makes them an attractive option for companies looking to use AI in their products or services. With the help of TPUs, businesses can develop and deploy faster, more efficient models that are better suited to their needs. TPUs offer a range of advantages over CPUs and GPUs: for instance, they provide up to 30x faster performance than traditional processors and up to 15x better energy efficiency, which makes them ideal for companies looking to develop complex models in a fraction of the time. Finally, TPUs are more affordable than other specialized hardware solutions, making them an attractive option for businesses of all sizes.

Tensor Processing Units are Google's ASICs for machine learning. TPUs are used specifically for deep learning to solve complex matrix and vector operations. TPUs are streamlined to solve matrix and vector operations at ultra-high speeds, but must be paired with a CPU to issue and execute instructions.
Applications for TPUs

TPUs can be used in various deep learning applications such as fraud detection, computer vision, natural language processing, self-driving cars, vocal AI, agriculture, virtual assistants, stock trading, e-commerce, and various social predictions.

When to Use TPUs

Since TPUs are highly specialized hardware for deep learning, they lack many of the other functions you would typically expect from a general-purpose processor such as a CPU. With this in mind, there are specific scenarios where using TPUs will yield the best results when training AI. The best time to use a TPU is for operations where models rely heavily on matrix computations, such as recommendation systems for search engines. TPUs also yield great results for models in which the AI analyzes massive amounts of data points, training that would otherwise take multiple weeks or months to complete. AI engineers also use TPUs when there is no existing custom TensorFlow model and they have to start from scratch.
When Not to Use TPUs

As stated earlier, the optimization of TPUs means that these processors only work well for specific workloads. Therefore, there are instances where opting for a traditional CPU and GPU will yield faster results. These instances include:

- Rapid prototyping with maximum flexibility
- Models limited by the available data points
- Models that are simple and can be trained quickly
- Models too onerous to change
- Models reliant on custom TensorFlow operations written in C++
| TPU version | Specifications |
|---|---|
| TPUv1 | The first publicly announced TPU. Designed as an 8-bit matrix multiplication engine, limited to solving only integers. |
| TPUv2 | Engineers noted that TPUv1 was limited in bandwidth; this version doubles the memory bandwidth with 16 GB of RAM. It can handle floating point, making it useful for both training and inference. |
| TPUv3 | Released in 2018, TPUv3 has twice the processors and is deployed with four times as many chips as TPUv2. These upgrades give this version eight times the performance of previous versions. |
| TPUv4 | The latest version of the TPU, announced on May 18, 2021. Google's CEO announced that this version would have more than twice the performance of TPUv3. |
| Edge TPU | This TPU version is meant for smaller operations and is optimized to use less power than the other TPU versions. Although it uses only two watts of power, the Edge TPU can perform up to four tera-operations per second. The Edge TPU is found in small handheld devices such as Google's Pixel 4 smartphone. |
| Benefit of the TPU architecture | Description |
|---|---|
| High performance | The TPU architecture is designed to maximize performance, ensuring that the processor can execute operations at extremely high speeds. |
| Low power consumption | Compared to CPUs and GPUs, the TPU architecture requires significantly less power, making it ideal for applications in which energy efficiency is a priority. |
| Cost savings | The TPU architecture is designed to be affordable, making it an attractive solution for businesses that are looking to reduce their hardware costs. |
| Scalability | The TPU architecture is highly scalable and can accommodate a wide range of workloads, from small applications to large-scale projects. |
| Flexibility | The TPU architecture is flexible and can be adapted to meet the needs of different applications, making it suitable for a range of use cases. |
| Efficient training | The TPU architecture enables efficient training of deep learning models, allowing businesses to quickly iterate and improve their AI solutions. |
| Security | The TPU architecture is highly secure, making it an ideal solution for mission-critical applications that require high levels of security. |
| Enhanced reliability | The TPU architecture has enhanced reliability, providing businesses with the assurance that their hardware will perform as expected in any environment. |
| Easy to deploy | The TPU architecture is designed for easy deployment, allowing businesses to quickly set up and deploy their hardware solutions. |
| Open-source support | The TPU architecture is backed by an open-source community that provides support and assistance when needed, making it easier for businesses to get the most out of their hardware investments. |
| Improved efficiency | The TPU architecture is designed to optimize efficiency, allowing businesses to get the most out of their hardware resources and reducing the cost of running AI applications. |
| End-to-end solutions | The TPU architecture provides a complete end-to-end solution for all types of AI projects, allowing businesses to focus on their development and operations instead of worrying about hardware compatibility. |
| Cross-platform support | The TPU architecture is designed to work across multiple platforms, making it easier for businesses to deploy their AI solutions in any environment. |
| Future ready | The TPU architecture is designed with the future in mind, providing businesses with a solution that will remain up to date and ready to take on next-generation AI applications. |
| Industry standard | The TPU architecture is becoming an industry standard for AI applications, giving businesses the confidence that their hardware investments are future-proofed. |
Applications of the TPU

Tensor Processing Units (TPUs) are specialized ASIC chips designed to accelerate the performance of machine learning algorithms. They can be used in a variety of applications, ranging from cloud computing and edge computing to machine learning. TPUs provide an efficient way to process data, making them suitable for a range of tasks such as image recognition, language processing, and speech recognition. By leveraging the power of TPUs, organizations can reduce costs and optimize their operations.

Cloud Computing: TPUs are used in cloud computing to provide better performance for workloads that require a lot of data processing. This allows businesses to process large amounts of data quickly and accurately at a lower cost than ever before. With the help of TPUs, businesses can make more informed decisions faster and improve their operational efficiency.

Edge Computing: TPUs are also used in edge computing applications, which involve processing data at or near the source. This helps to reduce latency and improve performance for tasks such as streaming audio or video, autonomous driving, robotic navigation, and predictive analytics. Edge computing also facilitates faster and more reliable communication between devices in an IoT network.

Machine Learning: TPUs are used to accelerate machine learning models and algorithms. They can be used to develop novel architectures that are optimized for tasks such as natural language processing, image recognition, and speech recognition. By leveraging the power of TPUs, organizations can develop more complex models and algorithms faster, enabling them to achieve better results with their machine-learning applications.